# No encoding supplied: defaulting to UTF-8 ?
# R Options
options(stringsAsFactors=FALSE,
citation_format="pandoc",
dplyr.summarise.inform=FALSE,
knitr.table.format="html",
kableExtra_view_html=TRUE,
future.globals.maxSize=2000000000, mc.cores=4,
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
# Python3 needed for clustering, umap, other python packages
# Path to binary will be automatically found
# Set manually if it does not work
reticulate_python3_path = unname(Sys.which("python3"))
Sys.setenv(RETICULATE_PYTHON = reticulate_python3_path)
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for 'leiden' clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix, sctransform
# Tables: kableExtra, DT
# Plots: ggsci, ggpubr
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast
# Functional enrichment: enrichR
# Other: sessioninfo, cerebroApp
# Knitr default options
knitr::opts_chunk$set(echo=TRUE, # output code
cache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source='fold-hide', # by default collapse code blocks
dev=c('png', 'pdf'), # create figures in png and pdf; the first device (png) will be used for HTML output
dev.args=list(png=list(type="cairo"), # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
pdf=list(bg="white")), # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
dpi=96 # figure resolution
)This code chunk contains all parameters that are set specifically for the project.
param = list()
####################
# Input parameters #
####################
# Project ID
param$project_id = "pbmc"
# Path to input data
param$path_data = data.frame(name=c("pbmc_10x","pbmc_smartseq2"),
type=c("10x","smartseq2"),
path=c("test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/", "test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz"),
stats=c(NA, NA))
#param$path_data = data.frame(name="pbmc_10x",
# type="10x",
# path="test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/",
# stats=NA)
# Downsample data to at most n cells per sample (mainly for tests)
# NULL to deactivate
param$downsample_cells_n = NULL
# Path to output directory
param$path_out = "test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/"
# Marker genes based on literature, translated to Ensembl IDs
# xlsx file, one list per column, first row as header and Ensembl IDs below
# NULL if no known marker genes should be plotted
param$file_known_markers = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/known_markers.xlsx"
# Annotation via biomaRt
param$mart_dataset = "hsapiens_gene_ensembl"
param$annot_version = 98
param$annot_main = c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
param$mart_attributes = c(param$annot_main,
c("chromosome_name", "start_position", "end_position",
"percentage_gene_gc_content", "gene_biotype", "strand", "description"))
param$biomart_mirror = NULL
# Alternatively, we can read a previously compiled annotation table from file
param$file_annot = NULL
# Prefix for mitochondrial genes
param$mt = "^MT-"
#####################
# Filter parameters #
#####################
# Filter for cells
param$cell_filter = list(nFeature_RNA=c(200, NA), percent_mt=c(NA, 20))
# Filter for features
param$feature_filter = list(min_counts=1, min_cells=3) # feature has to be found by at least one count in one cell
# Samples to drop
# Cells from these samples will be dropped after initial QC
# Example: param$samples_to_drop = c("pbmc_smartseq2_NC", "pbmc_smartseq2_RNA"),
# where "pbmc_smartseq2" is the name of the dataset, and "NC" and "RNA" are the names of the subsamples
param$samples_to_drop = c()
# Drop samples with too few cells
param$samples_min_cells = 10
############################
# Normalisation parameters #
############################
# Which normalisation should be used for analysis?
# "RNA" or "SCT"
param$norm = "RNA"
# Whether or not to remove cell cycle effects
param$cc_remove = FALSE
# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
param$cc_remove_all = FALSE
# Whether or not to re-score cell cycle effects after data
# from different samples have been merged
param$cc_rescore_after_merge = TRUE
# Additional (unwanted) variables that will be regressed out for visualisation and clustering
param$vars_to_regress = c()
# When there are multiple datasets, how to combine them:
# - method:
# - "single": Default when there is only one dataset after filtering, no integration is needed
# - "merge": Merge (in other words, concatenate) data when no integration is needed, e.g. when samples were multiplexed on the same chip
# - "standard": Integration: Anchors are computed for all pairs of datasets
# This will give all datasets the same weight during dataset integration but can be computationally intensive
# - "reference": Integration: One dataset is used as reference and anchors are computed for all other datasets
# Less accurate but computationally faster
# Not implemented yet
# - "reciprocal": Integration: Anchors are computed in PCA space instead of the data
# Even less accurate but for very big datasets
# - reference_dataset: When using method="reference", which dataset is the reference?
# Can be numeric or name of the dataset
# - dimensions: Number of dimensions to consider for integration
param$integrate_samples = list(method="standard", reference_dataset=1, dimensions=30)
# The number of PCs to use; adjust this parameter based on the Elbowplot
param$pc_n = 10
# Resolution of clusters; low values will lead to fewer clusters of cells
param$cluster_resolution=0.5
#######################################################
# Marker genes and genes with differential expression #
#######################################################
# Thresholds to define marker genes
param$marker_padj = 0.05
param$marker_log2FC = log2(2)
param$marker_pct = 0.25
# Additional (unwanted) variables to account for in statistical tests
param$latent_vars = c()
# Contrasts to find differentially expressed genes (R data.frame or Excel file)
# Required columns:
# condition_column: Categorial column in the cell metadata; specify "orig.ident" for sample and "seurat_clusters" for cluster
# condition_group1: Condition levels in group 1, multiple levels concatenated by the plus character
# Empty string = all levels not in group2 (cannot be used if group2 is empty)
# condition_group2: Condition levels in group 2, multiple levels concatenated by the plus character
# Empty string = all levels not in group1 (cannot be used if group1 is empty)
#
# Optional columns:
# subset_column: Categorial column in the cell metadata to subset before testing (default: NA)
# Specify "orig.ident" for sample and "seurat_clusters" for cluster
# subset_group: Further subset levels (default: NA)
# For the individual analysis of multiple levels separate by semicolons
# For the joint analysis of multiple levels concatenate by the plus character
# For the individual analysis of all levels empty string ""
# assay: Seurat assay to test on; can also be a Seurat dimensionality reduction (default: "RNA")
# slot: In case assay is a Seurat assay object, which slot to use (default: "data")
# padj: Maximum adjusted p-value (default: 0.05)
# log2FC: Minimum absolute log2 fold change (default: 0)
# min_pct: Minimum percentage of cells expressing a gene to test (default: 0.1)
# test: Type of test; "wilcox", "bimod", "roc", "t", "negbinom", "poisson", "LR", "MAST", "DESeq2"; (default: "wilcox")
# latent_vars: Additional variables to account for; multiple variables need to be concatenated by semicolons; will overwrite the default by param$latent_vars (default: none).
param$deg_contrasts = data.frame(condition_column=c("orig.ident", "orig.ident", "Phase"),
condition_group1=c("pbmc_10x", "pbmc_10x", "G1"),
condition_group2=c("pbmc_smartseq2_sample1", "pbmc_smartseq2_sample1", "G2M"),
subset_column=c(NA, "seurat_clusters", "seurat_clusters"),
subset_group=c(NA, "", "1;2"))
# P-value threshold for functional enrichment tests
param$enrichr_padj = 0.05
# Enrichr databases of interest
param$enrichr_dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")
######################
# General parameters #
######################
# Main colour to use for plots
param$col = "palevioletred"
# Colour palette and colours used for samples
param$col_palette_samples = "ggsci::pal_jama"
# Colour palette and colours used for cluster
param$col_palette_clusters = "ggsci::pal_startrek"
# Path to git repository
param$path_to_git = "."
# Debugging mode:
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions
param$debugging_mode = "default_debugging"# Path for figures in png and pdf format
knitr::opts_chunk$set(fig.path=paste(param$path_out, "figures/", sep="/"))
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("R/functions_io.R",
"R/functions_plotting.R",
"R/functions_analysis.R",
"R/functions_degs.R",
"R/functions_util.R")
git_files_to_source = paste(param$path_to_git, git_files_to_source, sep="/")
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:",paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))
# Debugging mode:
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions
switch (param$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging())
# Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)
# Create output directories
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
#dir.create(paste(param$path_out, "images", sep="/"), recursive=TRUE, showWarnings=FALSE)
#dir.create(paste(param$path_out, "marker", sep="/"), recursive=TRUE, showWarnings=FALSE)
#dir.create(paste(param$path_out, "degs", sep="/"), recursive=TRUE, showWarnings=FALSE)
# Do checks
error_messages = c()
# Check installed packages
error_messages = c(error_messages, check_installed_packages())
# Check python
error_messages = c(error_messages, check_python())
# Check parameters
error_messages = c(error_messages, check_parameters(param))
# Check enrichR
error_messages = c(error_messages, check_enrichr(param$enrichr_dbs))
# Check ensembl
error_messages = c(error_messages, check_ensembl(biomart="ensembl",
dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version,
attributes=param$mart_attributes))We begin by printing mapping statistics that have been produced prior to this workflow.
# Are statistics provided?
if (!all(is.na(param$path_data$stats))) {
# Loop through all samples and read mapping stats
mapping_stats_list = list()
for (i in 1:nrow(param$path_data)) {
if (!is.na(param$path_data$stats[i])) {
mapping_stats_list[[param$path_data$name[i]]] = read.delim(param$path_data$stats[i],
sep=",", header=FALSE, check.names=FALSE) %>%
t() %>% as.data.frame()
}
}
# Join all mapping stats tables
mapping_stats = mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="V1")
rownames(mapping_stats) = mapping_stats[["V1"]]
mapping_stats = mapping_stats %>% dplyr::select(-V1)
colnames(mapping_stats) = names(mapping_stats_list)
# Print table to HTML
knitr::kable(mapping_stats, align="l", caption="Mapping statistics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}× (Message)
Mapping statistics cannot be shown. No valid file provided.
If not provided already, we read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat rownames.
# If not provided by user, save annotation in the path_out directory
if (is.null(param$file_annot)) {
param$file_annot = file.path(param$path_out, paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))
}
# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
annot_ensembl = read.delim(param$file_annot)
} else {
annot_mart = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version))
annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes, useCache=FALSE)
write.table(annot_ensembl, file=param$file_annot, sep='\t', col.names=TRUE, row.names=FALSE, append=FALSE)
message("Gene annotation file was created at: ", param$file_annot)
# Note: depending on the attributes, there might be more than one row per gene
}
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]
# Ensembl id to gene symbol
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])
# Ensembl id to seurat-compatible unique rowname
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])
# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)
# Gene symbol to ensembl id: named LIST to account for genes where one symbol translates to multiple Ensembl IDs
symbol_to_ensembl_df = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_ensembl = split(symbol_to_ensembl_df[, ensembl], symbol_to_ensembl_df[, symbol])
# Gene symbol to (seurat compatible unique) gene symbol: named LIST to account for genes with multiple names
symbol_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_seurat_rowname$seurat_rowname = ensembl_to_seurat_rowname[symbol_to_seurat_rowname[, ensembl]]
symbol_to_seurat_rowname = split(symbol_to_seurat_rowname$seurat_rowname, symbol_to_seurat_rowname[, symbol])
# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])
# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})
# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]# Use biomart to translate human cell cycle genes to the species of interest and save them in a file
cc_genes_marker_file = paste0(param$path_out, "/cell_cycle_markers.xlsx")
if (file.exists(cc_genes_marker_file)) {
# Load from file
genes_s = openxlsx::read.xlsx(cc_genes_marker_file, sheet=1)
genes_g2m = openxlsx::read.xlsx(cc_genes_marker_file, sheet=2)
} else {
# Obtain from Ensembl
# Note: both mart objects must point to the same mirror for biomarT::getLDS to work
mart_human = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset="hsapiens_gene_ensembl",
mirror=param$biomart_mirror,
version=param$annot_version))
mart_myspecies = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=GetBiomaRtMirror(mart_human),
version=param$annot_version))
# S phase marker
genes_s = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_s) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
# G2/M marker
genes_g2m = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_g2m) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
# Write to file
openxlsx::write.xlsx(list(S_phase=genes_s,G2M_phase=genes_g2m),file=cc_genes_marker_file)
}
# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))We next read the scRNA-seq dataset(s) into Seurat.
# List of Seurat objects
sc = list()
datasets = param$path_data
for (i in seq(nrow(datasets))) {
name = datasets[i,"name"]
type = datasets[i,"type"]
path = datasets[i,"path"]
# Read 10X or smartseq2
if (type == "10x") {
# Read 10X sparse matrix into a Seurat object
sc = c(sc, ReadSparseMatrix(path,
project=name,
row_name_column=1,
convert_row_names=ensembl_to_seurat_rowname))
} else if (type == "smartseq2") {
# Read counts table into a Seurat object
sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE))
}
}
# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
sample_names[i] = paste(head(sc[[i]][["orig.dataset", drop=TRUE]],1), sample_names[i], sep=".")
sc[[i]][["orig.ident"]] = sample_names[i]
}
# Make cell names unique
sc = purrr::map(list_indices(sc), function(i){
cell_names = gsub("-\\d+", "", colnames(sc[[i]]))
Seurat::RenameCells(sc[[i]], new.names=paste(cell_names, i, sep="-"))
})
# Set up colors for samples
sample_names = purrr::flatten_chr(purrr::map(sc, function(s){ unique(as.character(s[[]][["orig.ident"]])) }))
param$col_samples = GenerateColours(num_colours=length(sample_names), palette=param$col_palette_samples)
names(param$col_samples) = sample_names
# Downsample cells if requested
if (!is.null(param$downsample_cells_n)) {
sc = purrr::map(sc, function(s) {
cells = ScSampleCells(sc=s, n=param$downsample_cells_n)
return(subset(s, cells=cells))
})
}
sc## $pbmc_10x
## An object of class Seurat
## 33694 features across 4033 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
##
## $pbmc_smartseq2_sample1
## An object of class Seurat
## 33694 features across 311 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
The following first table shows metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), or the above mentioned number of unique genes detected (“nFeature_RNA”). The second table shows metadata (columns) of the first 5 genes (rows).
# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())
# Print cell metadata
knitr::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| orig.ident | nCount_RNA | nFeature_RNA | orig.dataset | SampleName | PlateNumber | PlateRow | PlateCol | |
|---|---|---|---|---|---|---|---|---|
| PBMC1_10x_AAACCCACACTTGGGC-1 | pbmc_10x | 7552 | 2037 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_AAACCCACAGGTGGAT-1 | pbmc_10x | 4773 | 1800 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_AAACCCAGTGCTTATG-1 | pbmc_10x | 4430 | 1565 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_AAACCCATCCGTAGTA-1 | pbmc_10x | 4512 | 1592 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_AAACCCATCTTACACT-1 | pbmc_10x | 6663 | 1919 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_AAACGAAGTCTAGTGT-1 | pbmc_10x | 143 | 89 | pbmc_10x | NA | NA | NA | NA |
# Print gene metadata
knitr::kable(head(sc[[1]][["RNA"]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| feature_id | feature_name | feature_type | |
|---|---|---|---|
| TSPAN6 | ENSG00000000003 | TSPAN6 | Gene Expression |
| TNMD | ENSG00000000005 | TNMD | Gene Expression |
| DPM1 | ENSG00000000419 | DPM1 | Gene Expression |
| SCYL3 | ENSG00000000457 | SCYL3 | Gene Expression |
| C1orf112 | ENSG00000000460 | C1orf112 | Gene Expression |
We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature_RNA”), the total number of molecules detected in each cell (“nCount_RNA”), and the percentage of counts that map to the mitochrondrial genome (“percent_mt”). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$cell_filter)) {
return(param$cell_filter[[s]])
} else {
return(param$cell_filter)
}
})
param$feature_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$feature_filter)) {
return(param$feature_filter[[s]])
} else {
return(param$feature_filter)
}
})# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, Seurat::PercentageFeatureSet, pattern=param$mt, col.name="percent_mt", assay="RNA")
# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
return(s)
})
# Combine (again) cell metadata of the Seurat objects into one big metadata, this time including mt and ercc
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
# This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
# Calculate percentage of counts per gene in a cell
counts_rna = Seurat::GetAssayData(sc[[n]], slot="counts", assay="RNA")
total_counts = sc[[n]][["nCount_RNA", drop=TRUE]]
counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
# Calculate feature filters
num_cells_expr = Matrix::rowSums(counts_rna >= 1)
num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
# Calculate median of counts_rna_perc per gene
counts_median = apply(counts_rna_perc, 1, median)
# Add all QC measures as metadata
sc[[n]][["RNA"]] = Seurat::AddMetaData(sc[[n]][["RNA"]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
return(sc[[n]])
})# Plot QC metrics for cells
cell_qc_features = c("nFeature_RNA", "nCount_RNA", "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)
p_list = list()
for (i in names(cell_qc_features)) {
p_list[[i]]= ggplot(sc_cell_metadata[, c("orig.ident", i)], aes_string(x="orig.ident", y=i, fill="orig.ident")) +
geom_violin(scale="width")
# Adds points for samples with less than three cells since geom_violin does not work here
p_list[[i]] = p_list[[i]] +
geom_point(data=sc_cell_metadata[, c("orig.ident", i)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes_string(x="orig.ident", y=i, fill="orig.ident"), shape=21, size=2)
# Now add styles
p_list[[i]] = p_list[[i]] +
AddStyle(title=i, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Creates a table with min/max values for filter i for each dataset
cell_filter_for_plot = purrr::map_dfr(names(param$cell_filter), function(n) {
# If filter i in cell filter of the dataset, then create dataframe with columns orig.ident, threshold and value
if (i %in% names(param$cell_filter[[n]])){
data.frame(orig.ident=n, threshold=c("min", "max"), value=param$cell_filter[[n]][[i]], stringsAsFactors=FALSE)
}
})
# Add filters as segments to plot
if (nrow(cell_filter_for_plot) > 0) {
# Remove entries that are NA
cell_filter_for_plot = cell_filter_for_plot %>% dplyr::filter(!is.na(value))
p_list[[i]] = p_list[[i]] + geom_segment(data=cell_filter_for_plot,
aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value),
lty=2, col="firebrick")
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values")
p# Correlate QC metrics for cells
p_list = list()
p_list[[1]] = ggplot(sc_cell_metadata, aes_string(x=cell_qc_features[2], y=cell_qc_features[1], colour="orig.ident")) +
geom_point() +
AddStyle(col=param$col_samples)
p_list[[2]] = ggplot(sc_cell_metadata, aes_string(x=cell_qc_features[3], y=cell_qc_features[1], colour="orig.ident")) +
geom_point() +
AddStyle(col=param$col_samples)
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Features plotted against each other")
if (length(sc)==1) {
p = p & theme(legend.position="bottom")
} else {
p = p + patchwork::plot_layout(guides = "collect") & theme(legend.position="bottom")
}
pWe next investigate whether there are individual genes that are represented by an unusually high number of counts. For each cell, we first calculate the percentage of counts per gene. Subsequently, for each gene, we calculate the median value of these percentages. Genes with the highest median percentage of counts are plotted below.
# Plot only samples that we intend to keep
sc_names = names(sc)[!(names(sc) %in% param$samples_to_drop)]
genes_highestExpr = lapply(sc_names, function(i) {
idx = sc[[i]][["RNA"]][["counts_median"]] %>% order(decreasing=TRUE) %>% head(n=10)
return(rownames(sc[[i]][["RNA"]][[]])[idx])
}) %>%
unlist() %>%
unique()
genes_highestExpr_counts = purrr::map_dfc(sc[sc_names], .f=function(s) s[["RNA"]][["counts_median"]][genes_highestExpr, ])
genes_highestExpr_counts$gene = genes_highestExpr
genes_highestExpr_counts = genes_highestExpr_counts %>% tidyr::pivot_longer(cols=all_of(sc_names))
genes_highestExpr_counts$name = factor(genes_highestExpr_counts$name, levels=sc_names)
col = GenerateColours(num_colours=length(genes_highestExpr), palette="ggsci::pal_simpsons")
ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) +
geom_point() +
geom_line() +
AddStyle(title="Top 10 highest expressed genes per sample, added into one list",
xlab="Sample", ylab="Median % of raw counts\n per gene in a cell",
legend_position="bottom",
col=col)Cells and genes are filtered based on the following thresholds:
cell_filter_list = param$cell_filter %>% unlist(recursive=FALSE)
cell_filter_tbl = cell_filter_list %>% purrr::reduce(rbind) %>% as.data.frame()
rownames(cell_filter_tbl) = names(cell_filter_list)
colnames(cell_filter_tbl) = c("Min", "Max")
feature_filter_list = param$feature_filter %>% unlist(recursive=FALSE)
feature_filter_tbl = feature_filter_list %>% purrr::reduce(rbind) %>% as.data.frame()
rownames(feature_filter_tbl) = names(feature_filter_list)
colnames(feature_filter_tbl) = "n"
knitr::kable(cell_filter_tbl, align="l", caption="Filters applied to cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Min | Max | |
|---|---|---|
| pbmc_10x.nFeature_RNA | 200 | NA |
| pbmc_10x.percent_mt | NA | 20 |
| pbmc_smartseq2_sample1.nFeature_RNA | 200 | NA |
| pbmc_smartseq2_sample1.percent_mt | NA | 20 |
knitr::kable(feature_filter_tbl, align="l", caption="Filters applied to genes") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| n | |
|---|---|
| pbmc_10x.min_counts | 1 |
| pbmc_10x.min_cells | 3 |
| pbmc_smartseq2_sample1.min_counts | 1 |
| pbmc_smartseq2_sample1.min_cells | 3 |
The number of excluded cells and features is as follows:
# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
sc_cells_to_exclude = purrr::map(list_names(sc), function(n) {
filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
filter = param$cell_filter[[n]][[f]]
if (is.numeric(filter)) {
if (is.na(filter[1])) filter[1] = -Inf # Minimum
if (is.na(filter[2])) filter[2] = Inf # Maximum
idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
return(names(which(idx_exclude)))
} else if (is.character(filter)) {
return(names(which(sc[[n]][[f, drop=TRUE]] %in% filter)))
}
})
# Samples to drop
if (n %in% param$samples_to_drop) {
filter_result[["samples_to_drop"]] = colnames(sc[[n]])
} else {
filter_result[["samples_to_drop"]] = as.character(c())
}
# Minimum number of cells for a sample to keep
if (ncol(sc[[n]]) < param$samples_min_cells) {
filter_result[["samples_min_cells"]] = colnames(sc[[n]])
} else {
filter_result[["samples_min_cells"]] = as.character(c())
}
return(filter_result)
})
# Summarise
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
return(as.data.frame(purrr::map(s, length)))
})
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
knitr::kable(sc_cells_to_exclude_summary, align="l", caption="Number of excluded cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) | nFeature_RNA | percent_mt | samples_to_drop | samples_min_cells | |
|---|---|---|---|---|
| pbmc_10x | 286 | 271 | 0 | 0 |
| pbmc_smartseq2_sample1 | 1 | 0 | 0 | 0 |
# Now filter, drop the respective colours and adjust integration method
sc = purrr::map(list_names(sc), function(n) {
cells_to_keep = Cells(sc[[n]])
cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
if (length(cells_to_keep)==0) return(NULL)
else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)
if (length(sc)==1) param$integrate_samples[["method"]] = "single"# Only RNA assay at the moment
# Iterate over datasets and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
else return(names(which(sc[[n]][["RNA"]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})
# Summarise
sc_features_to_exclude_summary = purrr::map(sc_features_to_exclude, length) %>%
t() %>% as.data.frame()
rownames(sc_features_to_exclude_summary) = c("Genes")
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Number of excluded genes") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| pbmc_10x | pbmc_smartseq2_sample1 | |
|---|---|---|
| Genes | 13893 | 15754 |
# Now filter
sc = purrr::map(list_names(sc), function(n) {
assay_names = Seurat::Assays(sc[[n]])
features_to_keep = purrr::map(values_to_names(assay_names), function(a) {
features = rownames(sc[[n]][[a]])
keep = features[!features %in% sc_features_to_exclude[[n]]]
return(keep)
})
return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})After filtering, the size of the Seurat object is:
## $pbmc_10x
## An object of class Seurat
## 19801 features across 3608 samples within 1 assay
## Active assay: RNA (19801 features, 0 variable features)
##
## $pbmc_smartseq2_sample1
## An object of class Seurat
## 17940 features across 310 samples within 1 assay
## Active assay: RNA (17940 features, 0 variable features)
In this section, we subsequently run a series of Seurat functions for each provided sample:
1. We start by running a standard log normalisation, where counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
2. We assign cell cycle scores to each cell based on its normalised expression of G2/M and S phase markers. These scores are visualised in a separate section further below. If specified in the above parameter section, cell cycle effects are removed during scaling (step 3).
3. Dependent on the normalisation of your choice, we either
3a. Run standard functions to select variable genes, and scale normalised gene counts. For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly in others. To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score).
3b. Run SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling).
Note that removing all signal associated to cell cycle can negatively impact downstream analysis. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. As an alternative, we can remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences amongst proliferating cells will be removed. For a more detailed explanation, see the cell cycle vignette for Seurat (“Satija Lab” 2020). Cell cycle effects removed for this report: FALSE; all cell cycle effects removed for this report: FALSE.
While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.
# Normalise data the original way
# This is required to score cell cycle
# https://github.com/satijalab/seurat/issues/1679
sc = purrr::map(sc, Seurat::NormalizeData, normalization.method = "LogNormalize", scale.factor=10000, verbose=FALSE)# Determine cell cycle effect per sample
sc = purrr::map(list_names(sc), function(n) {
sc[[n]] = cc_scoring(sc=sc[[n]], genes_s=genes_s[,2], genes_g2m=genes_g2m[,2], name=n)
if (all(is.na(sc[[n]][["S.Score"]])) & all(is.na(sc[[n]][["G2M.Score"]]))) param$cc_remove=FALSE
return(sc[[n]])
})
# If cell cycle effects should be removed, we first score cells
# The effect is then removed in the following chunk
if (param$cc_remove) {
# Add to vars that need to regressed out during normalisation
if (param$cc_remove_all) {
# Remove all signal associated to cell cycle
param$vars_to_regress = unique(c(param$vars_to_regress, "S.Score", "G2M.Score"))
param$latent_vars = unique(c(param$latent_vars, "S.Score", "G2M.Score"))
} else {
# Don't remove the difference between cycling and non-cycling cells
param$vars_to_regress = unique(c(param$vars_to_regress, "CC.Difference"))
param$latent_vars = unique(c(param$latent_vars, "CC.Difference"))
}
}if (param$norm == "RNA") {
# Find variable features from normalised data (unaffected by scaling)
sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method = "vst", nfeatures = 3000, verbose=FALSE)
# Scale
# Note: For a single dataset where no integration/merging is needed, all features can already be scaled here.
# Otherwise, scaling of all features will be done after integration/merging.
if (param$integrate_samples[["method"]]=="single") {
sc[[1]] = Seurat::ScaleData(sc[[1]],
features=rownames(sc[[1]][["RNA"]]),
vars.to.regress=param$vars_to_regress,
verbose=FALSE)
}
} else if (param$norm == "SCT") {
# Run SCTransform
#
# This is a new normalisation method that replaces previous Seurat functions 'NormalizeData', 'FindVariableFeatures', and 'ScaleData'.
# vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
# paper: https://www.biorxiv.org/content/10.1101/576827v2
# Normalised data end up here: sc@assays$SCT@data
# Note: For a single dataset where no integration is needed, all features can already be scaled here.
# Otherwise, it is enough to scale only the variable features.
# Note: It is not guaranteed that all genes are successfully normalised with SCTransform.
# Consequently, some genes might be missing from the SCT assay.
# See: https://github.com/ChristophH/sctransform/issues/27
sc = purrr::map(list_names(sc), function(n) {
SCTransform(sc[[n]],
vars.to.regress=param$vars_to_regress,
min_cells=param$feature_filter[[n]][["min_cells"]],
verbose=FALSE,
return.only.var.genes=!(param$integrate_samples[["method"]]=="single"))
})
}Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells.
To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in. Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (Hafemeister and Satija (2019)). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability. The top 3,000 variable genes are used for further analysis.
p_list = purrr::map(list_names(sc), function(n) {
top10 = head(Seurat::VariableFeatures(sc[[n]], assay=param$norm), 10)
p = Seurat::VariableFeaturePlot(sc[[n]],
assay=param$norm,
selection.method=ifelse(param$norm=="RNA", "vst", "sct"),
col=c("grey", param$col)) +
AddStyle(title=n) +
theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
return(p)
})
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Variable genes")
pif (param$integrate_samples[["method"]]!="single") {
# When merging, feature meta-data is removed by Seurat entirely; save separately for each assay and add again afterwards
assay_names = unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } )))
# Loop through all assays and accumulate meta data
sc_feature_metadata = purrr::map(values_to_names(assay_names), function(a) {
# "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
# This step is skipped for assays that do not contain all three types of feature information
contains_neccessary_columns = purrr::map_lgl(list_names(sc), function(n) {
all(c("feature_id", "feature_name", "feature_type") %in% colnames(sc[[n]][[a]][[]]))
})
if (all(contains_neccessary_columns)) {
feature_id_name_type = purrr::map(sc, function(s) return(s[[a]][[c("feature_id", "feature_name", "feature_type")]]) )
feature_id_name_type = purrr::reduce(feature_id_name_type, function(df_x, df_y) {
new_rows = which(!rownames(df_y) %in% rownames(df_x))
if (length(new_rows) > 0) return(rbind(df_x, df_y[new_rows, ]))
else return(df_x)
})
feature_id_name_type$row_names = rownames(feature_id_name_type)
} else {
feature_id_name_type = NULL
}
# For all other meta-data, we prefix column names with the dataset
other_feature_data = purrr::map(list_names(sc), function(n) {
df = sc[[n]][[a]][[]]
if (contains_neccessary_columns[[n]]) df = df %>% dplyr::select(-dplyr::one_of(c("feature_id", "feature_name", "feature_type"), c()))
if (ncol(df) > 0) colnames(df) = paste(n, colnames(df), sep=".")
df$row_names = rownames(df)
return(df)
})
# Now join everything by row_names by full outer join
if (!is.null(feature_id_name_type)) {
feature_data = purrr::reduce(c(list(feature_id_name_type=feature_id_name_type), other_feature_data), dplyr::full_join, by="row_names")
} else {
feature_data = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
}
rownames(feature_data) = feature_data$row_names
feature_data$row_names = NULL
return(feature_data)
})
# When merging, cell metadata are merged but factors are not kept
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())
sc_cell_metadata_factor_levels = purrr::map(which(sapply(sc_cell_metadata, is.factor)), function(n) {
return(levels(sc_cell_metadata[, n, drop=TRUE]))
})
}# Standard method for integrating multiple samples.
# Best performance but computationally intensive.
if (param$integrate_samples[["method"]]=="standard") {
# Note "Assay names should only have numbers and letters: Warnung: Keys should be one or more alphanumeric characters followed by an underscore, setting key from rna_integrated_ to rnaintegrated_" (seurat/R/object.R)
# The integration step will temporarily occupy a lot of memory.
# However, R has problems with freeing unused memory.
# By wrapping the steps into a function, hopefully this works a bit better.
run_standard_integration = function(sc_objs, ndims=30, vars_to_regress=c(), feature_filter=c(), verbose=FALSE, assay="RNA") {
# How many neighbors to use when filtering anchors; also adjust weight
k.filter = min(200, min(sapply(sc_objs, ncol)))
k.weight = min(100, k.filter)
# Find integration anchors for assay RNA
if (assay == "RNA") {
integrate_RNA_anchors = Seurat::FindIntegrationAnchors(object.list=sc_objs,
dims=1:ndims,
anchor.features=3000,
k.filter=k.filter,
verbose=verbose)
sc_objs = Seurat::IntegrateData(integrate_RNA_anchors,
new.assay.name="RNAintegrated",
dims=1:ndims,
k.weight=k.weight,
verbose=verbose)
# According to Seurat, we need to scale data again for "RNAintegrated", and "RNA"
sc_objs = Seurat::ScaleData(sc_objs,
features=rownames(sc_objs[["RNAintegrated"]]),
vars.to.regress=vars_to_regress,
assay="RNAintegrated",
verbose=verbose)
DefaultAssay(sc_objs) = "RNA"
sc_objs = Seurat::ScaleData(sc_objs,
features=rownames(sc_objs[["RNA"]]),
vars.to.regress=vars_to_regress,
assay="RNA",
verbose=verbose)
rm(integrate_RNA_anchors)
} else if (assay == "SCT") {
# Find integration anchors for assay SCT
integrate_SCT_features = SelectIntegrationFeatures(object.list=sc_objs,
nfeatures=3000,
verbose=verbose)
sc_objs = PrepSCTIntegration(object.list=sc_objs,
anchor.features=integrate_SCT_features,
assay=rep("SCT",length(sc_objs)),
verbose=verbose)
integrate_SCT_anchors = FindIntegrationAnchors(object.list=sc_objs,
dims=1:ndims,
normalization.method="SCT",
anchor.features=integrate_SCT_features,
k.filter=k.filter,
verbose=verbose)
sc_objs = Seurat::IntegrateData(integrate_SCT_anchors,
new.assay.name="SCTintegrated",
normalization.method="SCT",
dims=1:ndims,
k.weight=k.weight,
verbose=verbose)
# We need to re-run SCTransform for the "SCT" assay again, to normalise on the complete dataset
DefaultAssay(sc_objs) = "SCT"
min_cells_overall = max(purrr::map_int(feature_filter, function(f) as.integer(f[["min_cells"]])))
sc_objs = SCTransform(sc_objs,
vars.to.regress=vars_to_regress,
min_cells=min_cells_overall,
verbose=FALSE,
return.only.var.genes=FALSE)
rm(integrate_SCT_features, integrate_SCT_anchors)
}
# Call garbage collector to free memory (hope it helps)
gc(verbose=verbose)
return(sc_objs)
}
# call function
sc = run_standard_integration(sc, ndims=param$integrate_samples[["dimensions"]], vars_to_regress=param$vars_to_regress, feature_filter=param$feature_filter, assay=param$norm)
# Add feature metadata
for (a in Seurat::Assays(sc)) {
if (a %in% names(sc_feature_metadata)) {
sc[[a]] = Seurat::AddMetaData(sc[[a]], sc_feature_metadata[[a]][rownames(sc[[a]]),, drop=FALSE])
}
}
# Fix cell metadata factors
for (f in names(sc_cell_metadata_factor_levels)) {
sc[[f]] = factor(sc[[f, drop=TRUE]], levels=sc_cell_metadata_factor_levels[[f]])
}
# Set default assay (will be the integrated version)
DefaultAssay(sc) = paste0(param$norm, "integrated")
message("Data values for all samples have been integrated.")
print(sc)
}× (Message)
Data values for all samples have been integrated.
## An object of class Seurat
## 24312 features across 3918 samples within 2 assays
## Active assay: RNAintegrated (3000 features, 3000 variable features)
## 1 other assay present: RNA
n_cells_rle_plot = min(100, sc[["orig.ident"]] %>% table() %>% min())
# Sample at most 100 cells per dataset and save their identity
cells_RLE_subset = sc[["orig.ident"]] %>% tibble::rownames_to_column() %>%
dplyr::group_by(orig.ident) %>%
dplyr::sample_n(size=n_cells_rle_plot) %>%
dplyr::select(rowname, orig.ident)
cells_RLE_sample = cells_RLE_subset %>% dplyr::pull(orig.ident)
cells_RLE_subset = cells_RLE_subset %>% dplyr::pull(rowname)To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.
For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#374E55FF, #DF8F44FF)
# Get counts from assay RNA
data_RLE_raw = GetAssayData(subset(sc, cells=cells_RLE_subset), assay="RNA", slot="counts")
# Plot
p = PlotRLE(as.matrix(log2(data_RLE_raw + 1)), id=cells_RLE_sample, col=param$col_samples) + labs(title="log2(raw counts + 1)")
pDependent on the context, this tab refers to different data:
# Get normalised data before integration
data_RLE_norm = GetAssayData(subset(sc, cells=cells_RLE_subset), assay=param$norm, slot="data")
# Plot
p = PlotRLE(as.matrix(data_RLE_norm), id=cells_RLE_sample, col=param$col_samples) + labs(title="Normalised data")
p if (! param$integrate_samples[["method"]] %in% c("single", "merge")) {
# Get normalised data for integrated assays
data_RLE_int = GetAssayData(subset(sc, cells=cells_RLE_subset), assay=paste0(param$norm, "integrated"), slot="data")
# Plot
p = PlotRLE(as.matrix(data_RLE_int), id=cells_RLE_sample, col=param$col_samples) + labs(title="Normalised integrated data")
p
} else {
message("No integrated data available.")
}A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.
We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.
To decide how many PCs to include in downstream analyses, we visualise the cells and genes that define the PCA.
# Run PCA for default normalisation
sc = Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE)
p_list = Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p = patchwork::wrap_plots(p_list, ncol = 2) + patchwork::plot_annotation("Top gene loadings of the first two PCs")
pp = Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples) +
AddStyle(title="Cells arranged by the first two PCs", legend_position="bottom")
pWe next need to decide how many PCs we want to use for our analyses. The following “Elbow plot” is designed to help us make an informed decision. PCs are ranked based on the percentage of variance they explain.
For the current dataset, 10 PCs were chosen.
# More approximate technique used to reduce computation time
p = Seurat::ElbowPlot(sc, ndims=20) +
geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) +
AddStyle(title="Elbow plot")
pSeurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm.
# Note: I changed the seed in ./lib/python3.6/site-packages/leidenalg/functions.py to 11 for reproducibility
# The number of clusters can be optimized by tuning 'resolution' -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE)
# Cluster using the Leiden algorithm
# Paper to Leiden algorithm: https://www.nature.com/articles/s41598-019-41695-z
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
# Taken from documentation:
# Method for running leiden (defaults to matrix which is fast for small datasets). Enable method = "igraph" to avoid casting large data to a dense matrix.
# Note Katrin:
# This is implemented as default recently and will be updated with a new Seurat release
# https://github.com/satijalab/seurat/pull/3233
sc = Seurat::FindClusters(sc, resolution=param$cluster_resolution, algorithm=4, verbose=FALSE, method="igraph")
# Construct phylogenetic tree relating the 'average' cell from each cluster
sc = BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE)
# Set up colors for clusters
cluster_names = levels(sc$seurat_clusters)
param$col_clusters = GenerateColours(num_colours=length(cluster_names), palette=param$col_palette_clusters)
names(param$col_clusters) = cluster_namesWe use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see (“Understanding Umap” 2019).
# Default UMAP
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot"))
# 3D UMAP
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, n.components=3, reduction.name="umap3d", reduction.key="UMAP3D_", verbose=FALSE, umap.method="uwot"))
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p = Seurat::DimPlot(sc, reduction="umap", label=TRUE) +
AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters") +
scale_color_manual(values=param$col_clusters, labels=cluster_labels)
p# Add a UMAP that is coloured by sample of origin
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% unique() %>% sort()
# Note: This is a hack to colour by sample but label by Cluster
p = Seurat::DimPlot(sc, group.by="orig.ident", pt.size=1, cols=param$col_samples) +
AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters")
p# Count cells per cluster per sample
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% unique() %>% sort()
cell_clusters = sc[[]] %>% dplyr::pull(seurat_clusters) %>% unique() %>% sort()
tbl = mapply(function(cl) {
sapply(cell_samples, function(s) sc[[]] %>% dplyr::filter(orig.ident==s, seurat_clusters==cl) %>% nrow())
}, cell_clusters)
colnames(tbl) = paste0("Cl_", cell_clusters)
rownames(tbl) = paste0(cell_samples, "_n")
# Add percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t()
rownames(tbl_perc) = gsub(rownames(tbl_perc), pattern="_n$", replacement="_perc", perl=TRUE)
tbl = rbind(tbl, tbl_perc)
# Add enrichment
if (length(cell_samples) > 1) tbl = rbind(tbl, cells_fisher(sc))
# Sort
tbl = tbl[order(rownames(tbl)), ]
# Plot percentages
tbl_bar = tbl[paste0(cell_samples, "_perc"), ] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(tidyr::starts_with("Cl"), names_to="Cluster", values_to="Percentage")
tbl_bar$Cluster = tbl_bar$Cluster %>% gsub(pattern="^Cl_", replacement="", perl=TRUE) %>% factor(levels=sc$seurat_clusters %>% levels())
tbl_bar$Sample = tbl_bar$Sample %>% gsub(pattern="_perc$", replacement="", perl=TRUE) %>% as.factor()
tbl_bar$Percentage = as.numeric(tbl_bar$Percentage)
p = ggplot(tbl_bar, aes(x=Cluster, y=Percentage, fill=Sample)) +
geom_bar(stat="identity" ) +
AddStyle(title="Percentage cells of samples in clusters",
fill=param$col_samples,
legend_title="Sample",
legend_position="bottom")
pThe following table shows the number of cells per sample per cluster:
In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher or lower than expected:
# Print table
knitr::kable(tbl, align="l", caption="Number of cells per cluster per sample") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| Cl_1 | Cl_2 | Cl_3 | Cl_4 | Cl_5 | Cl_6 | Cl_7 | Cl_8 | Cl_9 | Cl_10 | Cl_11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| pbmc_10x_n | 704 | 577 | 578 | 410 | 369 | 305 | 251 | 202 | 122 | 51 | 39 |
| pbmc_10x_perc | 91.43 | 92.32 | 94.44 | 94.47 | 89.35 | 90.5 | 93.31 | 93.09 | 93.85 | 76.12 | 88.64 |
| pbmc_10x.oddsRatio | 0.9 | 1.04 | 1.55 | 1.53 | 0.69 | 0.8 | 1.21 | 1.17 | 1.32 | 0.26 | 0.67 |
| pbmc_10x.p | 8.0e-01 | 4.4e-01 | 9.5e-03 | 2.8e-02 | 9.9e-01 | 8.9e-01 | 2.6e-01 | 3.4e-01 | 2.9e-01 | 1.0e+00 | 8.7e-01 |
| pbmc_smartseq2_sample1_n | 66 | 48 | 34 | 24 | 44 | 32 | 18 | 15 | 8 | 16 | 5 |
| pbmc_smartseq2_sample1_perc | 8.57 | 7.68 | 5.56 | 5.53 | 10.65 | 9.5 | 6.69 | 6.91 | 6.15 | 23.88 | 11.36 |
| pbmc_smartseq2_sample1.oddsRatio | 1.12 | 0.96 | 0.65 | 0.65 | 1.45 | 1.25 | 0.82 | 0.86 | 0.76 | 3.79 | 1.5 |
| pbmc_smartseq2_sample1.p | 2.5e-01 | 6.2e-01 | 9.9e-01 | 9.8e-01 | 2.1e-02 | 1.5e-01 | 8.1e-01 | 7.5e-01 | 8.2e-01 | 4.4e-05 | 2.7e-01 |
How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.
# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) +
geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score") +
AddStyle()
p2 = ggplot(sc@meta.data %>%
dplyr::group_by(seurat_clusters,Phase) %>%
dplyr::summarise(num_reads=length(Phase)),
aes(x=seurat_clusters, y=num_reads, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells") +
AddStyle()
p = p1 + p2 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases")
p# UMAP with phases superimposed
# Note: This is a hack to colour by phase but label by Cluster
p = Seurat::DimPlot(sc, group.by="Phase", pt.size=1) +
AddStyle(title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters")
pp = Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", label=TRUE, cols=c("lightgrey", param$col)) +
AddStyle(title="UMAP, cells coloured by S phase")
pp = Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col), label=TRUE) +
AddStyle(title="UMAP, cells coloured by G2M phase")
pp = Seurat::FeaturePlot(sc, features="CC.Difference", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col), label=TRUE) +
AddStyle(title="UMAP, cells coloured by CC.Difference")
pDo cells in individual clusters have particularly high counts, detected genes or mitochondrial content?
p1 = suppressMessages(Seurat::FeaturePlot(sc, features="nCount_RNA", label=TRUE) +
AddStyle(title="Feature plot") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=nCount_RNA, fill=seurat_clusters)) +
geom_violin(scale="width") +
AddStyle(title="Violin plot (log10 scale)", fill=param$col_clusters,
xlab="Cluster", legend_position="none") +
scale_y_log10()
p = p1 | p2
p = p + patchwork::plot_annotation(title="Summed raw counts (nCount_RNA, log10 scale)")
pp1 = suppressMessages(Seurat::FeaturePlot(sc, features="nFeature_RNA", label=TRUE) +
AddStyle(title="Feature plot") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=nFeature_RNA, fill=seurat_clusters)) +
geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none") +
scale_y_log10()
p = p1 | p2
p = p + patchwork::plot_annotation(title="Number of features with raw count > 0 (nFeature_RNA, log10 scale)")
pp1 = Seurat::FeaturePlot(sc, features="percent_mt", cols=c("lightgrey", param$col), label=TRUE) +
AddStyle(title="Feature plot")
p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_mt, fill=seurat_clusters)) +
geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none")
p = p1 | p2
p = p + patchwork::plot_annotation(title="Percent of mitochondrial features (percent_mt)")
pDo cells in individual clusters express provided known marker genes?
known_markers_list=c()
# Overwrite empty list of known markers
if (!is.null(param$file_known_markers)) {
# Read known marker genes and map to rownames
known_markers = openxlsx::read.xlsx(param$file_known_markers)
known_markers_list = lapply(colnames(known_markers), function(x) {
y = ensembl_to_seurat_rowname[known_markers[,x]] %>%
na.exclude() %>% unique() %>% sort()
m = !y %in% rownames(sc)
if (any(m)){
Warning(paste0("The following genes of marker list '", x, "' cannot be found in the data: ", first_n_elements_to_string(y[m], n=10)))
}
return(y[!m])
})
# Remove empty lists
names(known_markers_list) = colnames(known_markers)
is_empty = purrr::map_int(known_markers_list, .f=length) == 0
known_markers_list = known_markers_list[!is_empty]
}
# Set plot options
if(length(known_markers_list) > 0) {
known_markers_n = length(known_markers_list)
known_markers_vect = unlist(known_markers_list) %>% unique() %>% sort()
idx_dotplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) <= 50)
idx_avgplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) >= 10)
} else {
known_markers_n=0
idx_dotplot = idx_avgplot = FALSE
known_markers_vect = c()
}# Dotplots and average feature plots
# The height of 1 row (= 1 plot) is fixed to 5
fig_height_knownMarkers_dotplot = max(5, 5 * sum(idx_dotplot))
fig_height_knownMarkers_avgplot = max(5, 5 * sum(idx_avgplot))
# Individual feature plots
# Each row contains 2 plots
# We fix the height of each plot to the same height as is used later for DEGs
height_per_row = max(2, 0.3 * length(unique(Idents(sc))))
nr_rows = ceiling(length(known_markers_vect)/2)
fig_height_knownMarkers_vect = max(5, height_per_row * nr_rows)You provided 7 list(s) of known marker genes. In the following tabs, you find:
A dot plot visualises how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that expresses the gene, while the color encodes the scaled average expression across all cells within the cluster. Per gene, we group cells based on cluster identity, calculate average expression per cluster, subtract the mean of average expression values and divide by the standard deviation. The resulting scores describe how high or low a gene is expressed in a cluster compared to all other clusters.
if ((known_markers_n > 0) & any(idx_dotplot)) {
known_markers_dotplot = known_markers_list[idx_dotplot]
p_list = list()
for (i in seq(known_markers_dotplot)) {
g = known_markers_dotplot[[i]]
g = g[length(g):1]
p_list[[i]] = suppressMessages(Seurat::DotPlot(sc, features=g) +
scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste("Known marker genes:", names(known_markers_dotplot)[i]),
ylab="Cluster") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)))
}
p = patchwork::wrap_plots(p_list, ncol=1)
print(p)
} else if ((known_markers_n > 0) & !any(idx_dotplot)) {
message("This tab is used for dot plots for up to 50 genes. All provided lists are longer than this, and hence dot plots are skipped.")
} else {
message("No known marker genes were provided and hence dot plots are skipped.")
}An average feature plot visualises the average gene expression of each gene list on a single-cell level, subtracted by the aggregated expression of control feature sets. The color of the plot encodes the calculated scores, whereat positive scores suggest that genes are expressed more highly than expected.
if ((known_markers_n > 0) & any(idx_avgplot)) {
known_markers_avgplot = known_markers_list[idx_avgplot]
sc = Seurat::AddModuleScore(sc, features=known_markers_avgplot, ctrl=10, name="known_markers")
idx_replace_names = grep("^known_markers[0-9]+$", colnames(sc@meta.data), perl=TRUE)
colnames(sc@meta.data)[idx_replace_names] = names(known_markers_avgplot)
p_list = Seurat::FeaturePlot(sc, features=names(known_markers_avgplot), cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(known_markers_avgplot)) {
p_list[[i]] = p_list[[i]] + AddStyle(title=paste("Known marker genes:", names(known_markers_avgplot)[i]))
}
p = patchwork::wrap_plots(p_list, ncol=1)
print(p)
} else if ((known_markers_n > 0) & !any(idx_avgplot)) {
message("This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
} else {
message("No known marker genes were provided and hence average feature plots are skipped.")
}× (Message)
This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.
An individual feature plot colours single cells on the UMAP according to their normalised gene expression.
if ((known_markers_n > 0) & length(known_markers_vect) <= 100) {
p_list = Seurat::FeaturePlot(sc, features=known_markers_vect, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p = patchwork::wrap_plots(p_list, ncol=2)
print(p)
} else if (length(known_markers_vect) > 100) {
message("This tab is used to plot up to 100 known marker genes. Your provided list is longer than this, and hence individual feature plots are skipped.")
} else {
message("No known marker genes were provided and hence individual feature plots are skipped.")
}We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on raw “RNA” data and the method “MAST”. Resulting p-values are adjusted using the Bonferroni method. However, note that the p-values are likely inflated, since both clusters and marker genes were determined based on the same gene expression data, and there ought to be gene expression differences by design. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.
# Find DEGs for every cluster compared to all remaining cells, report positive (=markers) and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
# Note: By default, the function uses slot="data". Mast requires log data, so this is the correct way to do it.
# https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-interoperability.html
markers = suppressMessages(Seurat::FindAllMarkers(sc, assay="RNA", test.use="MAST",
only.pos=FALSE, min.pct=param$marker_pct, logfc.threshold=param$marker_log2FC,
latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE))
# If no markers were found, initialise the degs table so that further downstream (export) chunks run
if (ncol(markers)==0) markers = DegsEmptyMarkerResultsTable(levels(sc$seurat_clusters))
# For Seurat versions until 3.2, log fold change is based on the natural log. Convert to log base 2.
if ("avg_logFC" %in% colnames(markers) & !"avg_log2FC" %in% colnames(markers)) {
lfc_idx = grep("avg_log\\S*FC", colnames(markers))
markers[,lfc_idx] = marker_deg_results[,lfc_idx] / log(2)
col_nms = colnames(markers)
col_nms[2] = "avg_log2FC"
colnames(markers) = col_nms
}
# Sort markers
markers = markers %>% DegsSort(group=c("cluster"))
# Filter markers
markers_filt = DegsFilter(markers, cut_log2FC=param$marker_log2FC, cut_padj=param$marker_padj)
markers_found = nrow(markers_filt$all)>0
# Add average data to table
markers_out = cbind(markers_filt$all, DegsAvgDataPerIdentity(sc, genes=markers_filt$all$gene))
# Split by cluster and write to file
additional_readme = data.frame(Column=c("cluster",
"p_val_adj_score",
"avg_<assay>_<slot>_id<cluster>"),
Description=c("Cluster",
"Score calculated as follows: -log10(p_val_adj)*sign(avg_log2FC)",
"Average expression value for cluster; <assay>: RNA or SCT; <slot>: raw counts or normalised data"))
markers_results_file = DegsWriteToFile(split(markers_out, markers_out$cluster),
annot_ensembl=annot_ensembl,
gene_to_ensembl=seurat_rowname_to_ensembl,
additional_readme=additional_readme,
file=paste0(param$path_out, "/markers_cluster_vs_rest.xlsx"))
# Plot number of differentially expressed genes
p = DegsPlotNumbers(markers_filt$all,
group="cluster",
title=paste0("Number of DEGs, comparing each cluster to the rest\n(FC=", 2^param$marker_log2FC, ", adj. p-value=", param$marker_padj, ")"))
if (markers_found) {
print(p)
} else {
warning("No differentially expressed genes (cluster vs rest) found. The following related code is not executed, no related plots and tables are generated.")
}We use the term “marker genes” to specifically describe genes that are up-regulated in cells of one cluster compared to the rest.
if (markers_found) {
markers_top = DegsUpDisplayTop(markers_filt$up, n=5)
# Show table
knitr::kable(markers_top, align="l", caption="Up to top 5 marker genes per cell cluster") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")
}| cluster | gene | avg_log2FC | p_val | p_val_adj | pct.1 | pct.2 |
|---|---|---|---|---|---|---|
| 1 | GZMH | 2.150 | 0.0e+00 | 0.0e+00 | 0.986 | 0.264 |
| 1 | NKG7 | 1.842 | 1.2e-316 | 2.6e-312 | 1.000 | 0.513 |
| 1 | FGFBP2 | 1.935 | 1.8e-282 | 3.8e-278 | 0.908 | 0.241 |
| 1 | CCL5.1 | 1.597 | 6.0e-280 | 1.3e-275 | 1.000 | 0.489 |
| 1 | GZMB | 1.395 | 5.4e-264 | 1.1e-259 | 0.887 | 0.221 |
| 2 | DUSP2 | 1.221 | 5.4e-91 | 1.2e-86 | 0.773 | 0.422 |
| 2 | GZMK | 1.627 | 2.2e-69 | 4.6e-65 | 0.379 | 0.093 |
| 2 | CD8A | 1.010 | 6.2e-62 | 1.3e-57 | 0.682 | 0.336 |
| 2 | CMC1 | 1.084 | 2.0e-39 | 4.2e-35 | 0.322 | 0.122 |
| 3 | IL7R | 1.768 | 1.8e-244 | 3.9e-240 | 0.946 | 0.292 |
| 3 | LTB | 1.850 | 1.5e-235 | 3.2e-231 | 0.938 | 0.337 |
| 3 | LDHB | 1.173 | 2.3e-150 | 4.8e-146 | 0.943 | 0.623 |
| 3 | ZFP36L2 | 1.304 | 3.0e-146 | 6.3e-142 | 0.989 | 0.833 |
| 3 | LEPROTL1 | 1.134 | 5.5e-133 | 1.2e-128 | 0.923 | 0.534 |
| 4 | CD79A | 3.997 | 0.0e+00 | 0.0e+00 | 1.000 | 0.018 |
| 4 | HLA-DQA1 | 3.049 | 0.0e+00 | 0.0e+00 | 0.979 | 0.029 |
| 4 | MS4A1 | 2.844 | 0.0e+00 | 0.0e+00 | 0.878 | 0.016 |
| 4 | LINC00926 | 2.627 | 0.0e+00 | 0.0e+00 | 0.871 | 0.019 |
| 4 | BANK1 | 2.392 | 0.0e+00 | 0.0e+00 | 0.836 | 0.011 |
| 5 | S100A9 | 6.066 | 0.0e+00 | 0.0e+00 | 0.998 | 0.062 |
| 5 | VCAN | 4.367 | 0.0e+00 | 0.0e+00 | 0.988 | 0.026 |
| 5 | FCN1 | 3.983 | 0.0e+00 | 0.0e+00 | 0.993 | 0.053 |
| 5 | CD14 | 3.465 | 0.0e+00 | 0.0e+00 | 0.952 | 0.009 |
| 5 | CLEC7A | 2.696 | 0.0e+00 | 0.0e+00 | 0.959 | 0.056 |
| 6 | KLRF1 | 2.245 | 3.0e-249 | 6.4e-245 | 0.807 | 0.044 |
| 6 | GNLY | 2.917 | 2.0e-210 | 4.3e-206 | 0.976 | 0.315 |
| 6 | CD247 | 1.937 | 1.1e-205 | 2.4e-201 | 0.967 | 0.457 |
| 6 | PRF1 | 2.351 | 6.1e-192 | 1.3e-187 | 0.973 | 0.363 |
| 6 | GZMB | 2.231 | 2.6e-191 | 5.5e-187 | 0.953 | 0.296 |
| 7 | LEF1 | 1.709 | 5.3e-151 | 1.1e-146 | 0.758 | 0.078 |
| 7 | CCR7 | 1.749 | 2.4e-139 | 5.2e-135 | 0.770 | 0.095 |
| 7 | TCF7 | 1.382 | 6.1e-95 | 1.3e-90 | 0.859 | 0.314 |
| 7 | IL7R | 1.096 | 1.6e-88 | 3.3e-84 | 0.948 | 0.354 |
| 7 | TRABD2A | 1.231 | 4.6e-84 | 9.8e-80 | 0.561 | 0.077 |
| 8 | KLRB1 | 2.661 | 7.8e-131 | 1.7e-126 | 0.871 | 0.188 |
| 8 | GZMK | 1.832 | 9.2e-113 | 2.0e-108 | 0.788 | 0.101 |
| 8 | TNFAIP3 | 1.615 | 8.3e-77 | 1.8e-72 | 0.954 | 0.585 |
| 8 | DUSP2 | 1.522 | 2.7e-68 | 5.7e-64 | 0.935 | 0.451 |
| 8 | AQP3 | 1.862 | 2.4e-63 | 5.1e-59 | 0.507 | 0.104 |
| 9 | LST1 | 3.941 | 7.6e-245 | 1.6e-240 | 1.000 | 0.198 |
| 9 | IFITM3 | 3.400 | 1.2e-208 | 2.5e-204 | 0.992 | 0.268 |
| 9 | FCGR3A | 3.363 | 5.1e-200 | 1.1e-195 | 1.000 | 0.152 |
| 9 | AIF1 | 3.580 | 1.5e-186 | 3.2e-182 | 1.000 | 0.166 |
| 9 | CDKN1C.1 | 3.276 | 6.2e-185 | 1.3e-180 | 0.923 | 0.020 |
| 10 | RGS10 | 4.206 | 5.3e-275 | 1.1e-270 | 0.776 | 0.407 |
| 10 | MAX | 4.290 | 2.3e-253 | 4.9e-249 | 0.851 | 0.292 |
| 10 | TAGLN2 | 4.199 | 2.0e-249 | 4.2e-245 | 0.940 | 0.709 |
| 10 | OST4 | 3.347 | 6.8e-210 | 1.4e-205 | 0.866 | 0.791 |
| 10 | TUBA4A | 4.447 | 7.2e-205 | 1.5e-200 | 0.851 | 0.402 |
| 11 | HLA-DPB1.2 | 3.436 | 6.9e-82 | 1.5e-77 | 1.000 | 0.570 |
| 11 | HLA-DPA1.2 | 3.639 | 1.6e-80 | 3.4e-76 | 1.000 | 0.550 |
| 11 | FCER1A | 2.980 | 2.4e-59 | 5.1e-55 | 0.773 | 0.009 |
| 11 | CD74 | 3.186 | 2.3e-53 | 5.0e-49 | 1.000 | 0.775 |
| 11 | PLD4 | 2.314 | 4.6e-52 | 9.8e-48 | 0.909 | 0.042 |
The following plots are exemplary to how we can visualise differentially expressed genes using the Seurat R-package. The selected genes are the top marker genes for each cluster, respectively.
if (markers_found) {
# Get top 1 gene per cluster and plot
if (nrow(markers_filt$all) > 0) {
markers_examples = markers_filt$up %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=1, wt=avg_log2FC) %>%
dplyr::select(cluster, gene) %>%
as.data.frame()
} else {
markers_examples = data.frame(cluster=numeric(), gene=character())
}
markers_examples = setNames(markers_examples$gene, paste0(markers_examples$cluster, ": ", markers_examples$gene))
}# Note: We need to run this chunk as it specifies a variable that is used in chunk definitions below
if (markers_found) {
# Each row contains 2 plots
nr_rows = ceiling(length(markers_examples)/2)
# The height of each plot might depend on the number of clusters
height_per_row = max(2, 0.3 * length(unique(Idents(sc))))
# Total height of plots
fig_height_markers = max(5, height_per_row * nr_rows)
} else {
fig_height_markers = 7
}if (markers_found) {
# Shows gene expression on the UMAP
p_list = Seurat::FeaturePlot(sc, features=markers_examples, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(title=names(markers_examples)[i])
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data, top 1 marker gene per cluster")
p
}if (markers_found) {
# Violin plot of raw gene expression counts
p_list = Seurat::VlnPlot(sc, features=markers_examples, assay="RNA", slot="counts", combine=FALSE, pt.size=0.2)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(title=names(markers_examples)[i],
legend_title="Cluster",
fill=param$col_clusters,
xlab="Cluster")
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Violin plot of raw gene expression counts, top 1 marker gene per cluster") +
patchwork::plot_layout(guides = "collect") & theme(legend.position="bottom")
suppressMessages(p)
}if (markers_found) {
# Violin plot of normalised gene expression data
p_list = Seurat::VlnPlot(sc, features=markers_examples, combine=FALSE, pt.size=0.2)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(title=names(markers_examples)[i],
legend_title="Cluster",
fill=param$col_clusters,
xlab="Cluster")
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Violin plot of normalised gene expression data, top 1 marker gene per cluster") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
suppressMessages(p)
}if (markers_found) {
# Ridge plot of raw gene expression counts
p_list = Seurat::RidgePlot(sc, features=markers_examples, assay="RNA", slot="counts", combine=FALSE)
for (i in seq(p_list)) {
p_list[[i]] = p_list[[i]] + AddStyle(title=names(markers_examples)[i],
legend_title="Cluster",
fill=param$col_clusters,
ylab="Cluster")
}
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of raw gene expression counts, top 1 marker gene per cluster") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
suppressMessages(p)
}if (markers_found) {
# Ridge plot of normalised gene expression data
p_list = Seurat::RidgePlot(sc, features=markers_examples, slot="data", combine=FALSE)
for (i in seq(p_list)) {
p_list[[i]] = p_list[[i]] + AddStyle(title=names(markers_examples)[i],
legend_title="Cluster",
fill=param$col_clusters,
ylab="Cluster")
}
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of normalised and log-transformed gene expression data, top 1 marker gene per cluster") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
suppressMessages(p)
}if (markers_found) {
# Visualises how feature expression changes across different clusters
p_list = lapply(levels(sc$seurat_clusters), function(cl) {
genes = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
if (length(genes) > 0) {
genes = genes[length(genes):1]
p = suppressMessages(Seurat::DotPlot(sc, features=genes) +
scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste0("Top markers (up-regulated genes) for cluster ", cl), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
return(p)
} else {
message("No up-regulated genes for cluster ", cl, ".")
return(NULL)
}
})
p = patchwork::wrap_plots(p_list, ncol=2)
print(p)
}If the dataset contains multiple samples, we can visualise the expression of a gene that is up-regulated in a cluster separately for each sample. For each cluster, we extract up-regulated genes, and visualise expression of these genes in all cells in that cluster, split by their sample of origin.
fig_height_degs_per_cl = max(5,
max(2, 0.3 * (sc$orig.ident %>% unique() %>% length())) * length(levels(sc$seurat_clusters)) * 2)First, we plot scaled expression as explained above (see section Known marker genes). This plot allows us to judge whether the expression of a gene is increased in one sample as compared to the other samples.
p_list = list()
markers_filt_up_top = DegsUpDisplayTop(degs=markers_filt$up, n=50)
for (cl in levels(sc$seurat_clusters)) {
markers_filt_up_cl_top = markers_filt_up_top %>%
dplyr::filter(cluster==cl) %>%
dplyr::pull(gene)
if (length(markers_filt_up_cl_top) > 0) {
p_list[[cl]] = suppressMessages(Seurat::DotPlot(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident") +
scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste0("Up to 50 markers (up-regulated genes) for cluster ", cl), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
}
}
p = patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster")
pSecond, we plot normalised expression with no further scaling. This plot helps to get an impression of the total expression of a gene.
n_genes_max_dotplot = 50
p_list = list()
for (cl in levels(sc$seurat_clusters)) {
markers_filt_up_cl_top = markers_filt$up %>%
dplyr::filter(cluster==cl) %>%
dplyr::top_n(n=n_genes_max_dotplot, wt=p_val_adj_score) %>%
dplyr::pull(gene)
if (length(markers_filt_up_cl_top) > 0) {
p_list[[cl]] = Seurat::DotPlot(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident", scale=FALSE, cols=c("lightgrey", param$col)) +
AddStyle(title=paste0("Up to ", n_genes_max_dotplot, " markers (up-regulated genes) for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1))
}
}
p = patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster (not scaled)")
pif (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE)
# Heatmap of all differentially expressed genes
p = Seurat::DoHeatmap(sc, features=markers_filt$all$gene, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
NoLegend() +
theme(axis.text.y=element_blank()) +
ggtitle("Heatmap of scaled gene expression data, all genes differentially expressed between a cluster and the rest")
p
}if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE)
# With fig.height = 20, 300 features can be shown; distribute among clusters
features_per_group = 300/length(levels(markers_filt$up$cluster))
features_subset = markers_filt$up %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=features_per_group, wt=avg_log2FC) %>%
dplyr::arrange(cluster, -avg_log2FC) %>%
dplyr::pull(gene) %>%
unique()
# Heatmap of top up-regulated genes
p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
NoLegend() +
theme(axis.text.y=element_text(size=8)) +
ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
p
}if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE)
# With fig.height = 20, 300 features can be shown; distribute among clusters
features_per_group = 300/length(levels(markers_filt$down$cluster))
features_subset = markers_filt$down %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=features_per_group, wt=-avg_log2FC) %>%
dplyr::arrange(cluster, avg_log2FC) %>%
dplyr::pull(gene) %>%
unique()
# Heatmap of top down-regulated genes
p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
NoLegend() +
theme(axis.text.y=element_text(size=8)) +
ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
p
}To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.
We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (Chen 2020). You can choose to test functional enrichment from a wide range of databases:
dbs_all = enrichR::listEnrichrDbs()
knitr::kable(dbs_all, align="l", caption="Enrichr databases") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px")| geneCoverage | genesPerTerm | libraryName | link | numTerms |
|---|---|---|---|---|
| 13362 | 275 | Genome_Browser_PWMs | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | 615 |
| 27884 | 1284 | TRANSFAC_and_JASPAR_PWMs | http://jaspar.genereg.net/html/DOWNLOAD/ | 326 |
| 6002 | 77 | Transcription_Factor_PPIs | 290 | |
| 47172 | 1370 | ChEA_2013 | http://amp.pharm.mssm.edu/lib/cheadownload.jsp | 353 |
| 47107 | 509 | Drug_Perturbations_from_GEO_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 701 |
| 21493 | 3713 | ENCODE_TF_ChIP-seq_2014 | http://genome.ucsc.edu/ENCODE/downloads.html | 498 |
| 1295 | 18 | BioCarta_2013 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 249 |
| 3185 | 73 | Reactome_2013 | http://www.reactome.org/download/index.html | 78 |
| 2854 | 34 | WikiPathways_2013 | http://www.wikipathways.org/index.php/Download_Pathways | 199 |
| 15057 | 300 | Disease_Signatures_from_GEO_up_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 4128 | 48 | KEGG_2013 | http://www.kegg.jp/kegg/download/ | 200 |
| 34061 | 641 | TF-LOF_Expression_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 269 |
| 7504 | 155 | TargetScan_microRNA | http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 | 222 |
| 16399 | 247 | PPI_Hub_Proteins | http://amp.pharm.mssm.edu/X2K | 385 |
| 12753 | 57 | GO_Molecular_Function_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 1136 |
| 23726 | 127 | GeneSigDB | http://genesigdb.org/genesigdb/downloadall.jsp | 2139 |
| 32740 | 85 | Chromosome_Location | http://software.broadinstitute.org/gsea/msigdb/index.jsp | 386 |
| 13373 | 258 | Human_Gene_Atlas | http://biogps.org/downloads/ | 84 |
| 19270 | 388 | Mouse_Gene_Atlas | http://biogps.org/downloads/ | 96 |
| 13236 | 82 | GO_Cellular_Component_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 641 |
| 14264 | 58 | GO_Biological_Process_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 5192 |
| 3096 | 31 | Human_Phenotype_Ontology | http://www.human-phenotype-ontology.org/ | 1779 |
| 22288 | 4368 | Epigenomics_Roadmap_HM_ChIP-seq | http://www.roadmapepigenomics.org/ | 383 |
| 4533 | 37 | KEA_2013 | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 474 |
| 10231 | 158 | NURSA_Human_Endogenous_Complexome | https://www.nursa.org/nursa/index.jsf | 1796 |
| 2741 | 5 | CORUM | http://mips.helmholtz-muenchen.de/genre/proj/corum/ | 1658 |
| 5655 | 342 | SILAC_Phosphoproteomics | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 84 |
| 10406 | 715 | MGI_Mammalian_Phenotype_Level_3 | http://www.informatics.jax.org/ | 71 |
| 10493 | 200 | MGI_Mammalian_Phenotype_Level_4 | http://www.informatics.jax.org/ | 476 |
| 11251 | 100 | Old_CMAP_up | http://www.broadinstitute.org/cmap/ | 6100 |
| 8695 | 100 | Old_CMAP_down | http://www.broadinstitute.org/cmap/ | 6100 |
| 1759 | 25 | OMIM_Disease | http://www.omim.org/downloads | 90 |
| 2178 | 89 | OMIM_Expanded | http://www.omim.org/downloads | 187 |
| 851 | 15 | VirusMINT | http://mint.bio.uniroma2.it/download.html | 85 |
| 10061 | 106 | MSigDB_Computational | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 858 |
| 11250 | 166 | MSigDB_Oncogenic_Signatures | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 189 |
| 15406 | 300 | Disease_Signatures_from_GEO_down_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 17711 | 300 | Virus_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 17576 | 300 | Virus_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 15797 | 176 | Cancer_Cell_Line_Encyclopedia | https://portals.broadinstitute.org/ccle/home | 967 |
| 12232 | 343 | NCI-60_Cancer_Cell_Lines | http://biogps.org/downloads/ | 93 |
| 13572 | 301 | Tissue_Protein_Expression_from_ProteomicsDB | https://www.proteomicsdb.org/ | 207 |
| 6454 | 301 | Tissue_Protein_Expression_from_Human_Proteome_Map | http://www.humanproteomemap.org/index.php | 30 |
| 3723 | 47 | HMDB_Metabolites | http://www.hmdb.ca/downloads | 3906 |
| 7588 | 35 | Pfam_InterPro_Domains | ftp://ftp.ebi.ac.uk/pub/databases/interpro/ | 311 |
| 7682 | 78 | GO_Biological_Process_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 941 |
| 7324 | 172 | GO_Cellular_Component_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 205 |
| 8469 | 122 | GO_Molecular_Function_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 402 |
| 13121 | 305 | Allen_Brain_Atlas_up | http://www.brain-map.org/ | 2192 |
| 26382 | 1811 | ENCODE_TF_ChIP-seq_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 816 |
| 29065 | 2123 | ENCODE_Histone_Modifications_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 412 |
| 280 | 9 | Phosphatase_Substrates_from_DEPOD | http://www.koehn.embl.de/depod/ | 59 |
| 13877 | 304 | Allen_Brain_Atlas_down | http://www.brain-map.org/ | 2192 |
| 15852 | 912 | ENCODE_Histone_Modifications_2013 | http://genome.ucsc.edu/ENCODE/downloads.html | 109 |
| 4320 | 129 | Achilles_fitness_increase | http://www.broadinstitute.org/achilles | 216 |
| 4271 | 128 | Achilles_fitness_decrease | http://www.broadinstitute.org/achilles | 216 |
| 10496 | 201 | MGI_Mammalian_Phenotype_2013 | http://www.informatics.jax.org/ | 476 |
| 1678 | 21 | BioCarta_2015 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 239 |
| 756 | 12 | HumanCyc_2015 | http://humancyc.org/ | 125 |
| 3800 | 48 | KEGG_2015 | http://www.kegg.jp/kegg/download/ | 179 |
| 2541 | 39 | NCI-Nature_2015 | http://pid.nci.nih.gov/ | 209 |
| 1918 | 39 | Panther_2015 | http://www.pantherdb.org/ | 104 |
| 5863 | 51 | WikiPathways_2015 | http://www.wikipathways.org/index.php/Download_Pathways | 404 |
| 6768 | 47 | Reactome_2015 | http://www.reactome.org/download/index.html | 1389 |
| 25651 | 807 | ESCAPE | http://www.maayanlab.net/ESCAPE/ | 315 |
| 19129 | 1594 | HomoloGene | http://www.ncbi.nlm.nih.gov/homologene | 12 |
| 23939 | 293 | Disease_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23561 | 307 | Disease_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23877 | 302 | Drug_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 15886 | 9 | Genes_Associated_with_NIH_Grants | https://grants.nih.gov/grants/oer.htm | 32876 |
| 24350 | 299 | Drug_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 3102 | 25 | KEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 428 |
| 31132 | 298 | Gene_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 30832 | 302 | Gene_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 48230 | 1429 | ChEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 395 |
| 5613 | 36 | dbGaP | http://www.ncbi.nlm.nih.gov/gap | 345 |
| 9559 | 73 | LINCS_L1000_Chem_Pert_up | https://clue.io/ | 33132 |
| 9448 | 63 | LINCS_L1000_Chem_Pert_down | https://clue.io/ | 33132 |
| 16725 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_down | http://www.gtexportal.org/ | 2918 |
| 19249 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_up | http://www.gtexportal.org/ | 2918 |
| 15090 | 282 | Ligand_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 16129 | 292 | Aging_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15309 | 308 | Aging_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15103 | 318 | Ligand_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 15022 | 290 | MCF7_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15676 | 310 | MCF7_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15854 | 279 | Microbe_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 15015 | 321 | Microbe_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 3788 | 159 | LINCS_L1000_Ligand_Perturbations_down | https://clue.io/ | 96 |
| 3357 | 153 | LINCS_L1000_Ligand_Perturbations_up | https://clue.io/ | 96 |
| 12668 | 300 | L1000_Kinase_and_GPCR_Perturbations_down | https://clue.io/ | 3644 |
| 12638 | 300 | L1000_Kinase_and_GPCR_Perturbations_up | https://clue.io/ | 3644 |
| 8973 | 64 | Reactome_2016 | http://www.reactome.org/download/index.html | 1530 |
| 7010 | 87 | KEGG_2016 | http://www.kegg.jp/kegg/download/ | 293 |
| 5966 | 51 | WikiPathways_2016 | http://www.wikipathways.org/index.php/Download_Pathways | 437 |
| 15562 | 887 | ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X | 104 | |
| 17850 | 300 | Kinase_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 17660 | 300 | Kinase_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 1348 | 19 | BioCarta_2016 | http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 237 |
| 934 | 13 | HumanCyc_2016 | http://humancyc.org/ | 152 |
| 2541 | 39 | NCI-Nature_2016 | http://pid.nci.nih.gov/ | 209 |
| 2041 | 42 | Panther_2016 | http://www.pantherdb.org/pathway/ | 112 |
| 5209 | 300 | DrugMatrix | https://ntp.niehs.nih.gov/drugmatrix/ | 7876 |
| 49238 | 1550 | ChEA_2016 | http://amp.pharm.mssm.edu/Enrichr | 645 |
| 2243 | 19 | huMAP | http://proteincomplexes.org/ | 995 |
| 19586 | 545 | Jensen_TISSUES | http://tissues.jensenlab.org/ | 1842 |
| 22440 | 505 | RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 1302 |
| 8184 | 24 | MGI_Mammalian_Phenotype_2017 | http://www.informatics.jax.org/ | 5231 |
| 18329 | 161 | Jensen_COMPARTMENTS | http://compartments.jensenlab.org/ | 2283 |
| 15755 | 28 | Jensen_DISEASES | http://diseases.jensenlab.org/ | 1811 |
| 10271 | 22 | BioPlex_2017 | http://bioplex.hms.harvard.edu/ | 3915 |
| 10427 | 38 | GO_Cellular_Component_2017 | http://www.geneontology.org/ | 636 |
| 10601 | 25 | GO_Molecular_Function_2017 | http://www.geneontology.org/ | 972 |
| 13822 | 21 | GO_Biological_Process_2017 | http://www.geneontology.org/ | 3166 |
| 8002 | 143 | GO_Cellular_Component_2017b | http://www.geneontology.org/ | 816 |
| 10089 | 45 | GO_Molecular_Function_2017b | http://www.geneontology.org/ | 3271 |
| 13247 | 49 | GO_Biological_Process_2017b | http://www.geneontology.org/ | 10125 |
| 21809 | 2316 | ARCHS4_Tissues | http://amp.pharm.mssm.edu/archs4 | 108 |
| 23601 | 2395 | ARCHS4_Cell-lines | http://amp.pharm.mssm.edu/archs4 | 125 |
| 20883 | 299 | ARCHS4_IDG_Coexp | http://amp.pharm.mssm.edu/archs4 | 352 |
| 19612 | 299 | ARCHS4_Kinases_Coexp | http://amp.pharm.mssm.edu/archs4 | 498 |
| 25983 | 299 | ARCHS4_TFs_Coexp | http://amp.pharm.mssm.edu/archs4 | 1724 |
| 19500 | 137 | SysMyo_Muscle_Gene_Sets | http://sys-myo.rhcloud.com/ | 1135 |
| 14893 | 128 | miRTarBase_2017 | http://mirtarbase.mbc.nctu.edu.tw/ | 3240 |
| 17598 | 1208 | TargetScan_microRNA_2017 | http://www.targetscan.org/ | 683 |
| 5902 | 109 | Enrichr_Libraries_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 121 |
| 12486 | 299 | Enrichr_Submissions_TF-Gene_Coocurrence | http://amp.pharm.mssm.edu/Enrichr | 1722 |
| 1073 | 100 | Data_Acquisition_Method_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 12 |
| 19513 | 117 | DSigDB | http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ | 4026 |
| 14433 | 36 | GO_Biological_Process_2018 | http://www.geneontology.org/ | 5103 |
| 8655 | 61 | GO_Cellular_Component_2018 | http://www.geneontology.org/ | 446 |
| 11459 | 39 | GO_Molecular_Function_2018 | http://www.geneontology.org/ | 1151 |
| 19741 | 270 | TF_Perturbations_Followed_by_Expression | http://www.ncbi.nlm.nih.gov/geo/ | 1958 |
| 27360 | 802 | Chromosome_Location_hg19 | http://hgdownload.cse.ucsc.edu/downloads.html | 36 |
| 13072 | 26 | NIH_Funded_PIs_2017_Human_GeneRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 5687 |
| 13464 | 45 | NIH_Funded_PIs_2017_Human_AutoRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 13787 | 200 | Rare_Diseases_AutoRIF_ARCHS4_Predictions | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 13929 | 200 | Rare_Diseases_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 16964 | 200 | NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 17258 | 200 | NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 5684 |
| 10352 | 58 | Rare_Diseases_GeneRIF_Gene_Lists | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 10471 | 76 | Rare_Diseases_AutoRIF_Gene_Lists | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 12419 | 491 | SubCell_BarCode | http://www.subcellbarcode.org/ | 104 |
| 19378 | 37 | GWAS_Catalog_2019 | https://www.ebi.ac.uk/gwas | 1737 |
| 6201 | 45 | WikiPathways_2019_Human | https://www.wikipathways.org/ | 472 |
| 4558 | 54 | WikiPathways_2019_Mouse | https://www.wikipathways.org/ | 176 |
| 3264 | 22 | TRRUST_Transcription_Factors_2019 | https://www.grnpedia.org/trrust/ | 571 |
| 7802 | 92 | KEGG_2019_Human | https://www.kegg.jp/ | 308 |
| 8551 | 98 | KEGG_2019_Mouse | https://www.kegg.jp/ | 303 |
| 12444 | 23 | InterPro_Domains_2019 | https://www.ebi.ac.uk/interpro/ | 1071 |
| 9000 | 20 | Pfam_Domains_2019 | https://pfam.xfam.org/ | 608 |
| 7744 | 363 | DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 | https://depmap.org/ | 558 |
| 6204 | 387 | DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 | https://depmap.org/ | 325 |
| 13420 | 32 | MGI_Mammalian_Phenotype_Level_4_2019 | http://www.informatics.jax.org/ | 5261 |
| 14148 | 122 | UK_Biobank_GWAS_v1 | https://www.ukbiobank.ac.uk/tag/gwas/ | 857 |
| 9813 | 49 | BioPlanet_2019 | https://tripod.nih.gov/bioplanet/ | 1510 |
| 1397 | 13 | ClinVar_2019 | https://www.ncbi.nlm.nih.gov/clinvar/ | 182 |
| 9116 | 22 | PheWeb_2019 | http://pheweb.sph.umich.edu/ | 1161 |
| 17464 | 63 | DisGeNET | https://www.disgenet.org | 9828 |
| 394 | 73 | HMS_LINCS_KinomeScan | http://lincs.hms.harvard.edu/kinomescan/ | 148 |
| 11851 | 586 | CCLE_Proteomics_2020 | https://portals.broadinstitute.org/ccle | 378 |
| 8189 | 421 | ProteomicsDB_2020 | https://www.proteomicsdb.org/ | 913 |
| 18704 | 100 | lncHUB_lncRNA_Co-Expression | https://amp.pharm.mssm.edu/lnchub/ | 3729 |
| 5605 | 39 | Virus-Host_PPI_P-HIPSTer_2020 | http://phipster.org/ | 6715 |
| 5718 | 31 | Elsevier_Pathway_Collection | http://www.transgene.ru/disease-pathways/ | 1721 |
| 14156 | 40 | Table_Mining_of_CRISPR_Studies | 802 | |
| 16979 | 295 | COVID-19_Related_Gene_Sets | https://amp.pharm.mssm.edu/covid19 | 205 |
| 4383 | 146 | MSigDB_Hallmark_2020 | https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp | 50 |
| 54974 | 483 | Enrichr_Users_Contributed_Lists_2020 | https://maayanlab.cloud/Enrichr | 1482 |
| 12118 | 448 | TG_GATES_2020 | https://toxico.nibiohn.go.jp/english/ | 1190 |
| 12361 | 124 | Allen_Brain_Atlas_10x_scRNA_2021 | https://portal.brain-map.org/ | 766 |
if (markers_found) {
# Convert Seurat names of upregulated marker per cluster to Entrez; use named lists for translation
marker_genesets_up = sapply(levels(sc$seurat_clusters), function(x) {
tmp = markers_filt$up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
return(tmp[!is.na(tmp)])
}, USE.NAMES=TRUE, simplify=TRUE)
# Tests done by Enrichr
marker_genesets_up_enriched = purrr::map(marker_genesets_up, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
# Write to files
marker_genesets_up_enriched_files = purrr::map(names(marker_genesets_up_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_up_enriched[[n]],
file=paste0(param$path_out, "/Functions_marker_up_cluster", n, ".xlsx"))
})
# Convert Seurat names of downregulated marker per cluster to Entrez; use named lists for translation
marker_genesets_down = sapply(levels(sc$seurat_clusters), function(x) {
tmp = markers_filt$down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
return(tmp[!is.na(tmp)])
}, USE.NAMES=TRUE, simplify=TRUE)
# Tests done by Enrichr
marker_genesets_down_enriched = purrr::map(marker_genesets_down, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
# Write to files
marker_genesets_down_enriched_files = purrr::map(names(marker_genesets_down_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_down_enriched[[n]],
file=paste0(param$path_out, "/Functions_marker_down_cluster", n, ".xlsx"))
})
}The following table contains the top enriched terms per cluster.
# Top enriched terms (TODO: better plots, functions)
if (markers_found) {
cat('#### {.tabset} \n \n')
# Get top ten up over all databases
marker_genesets_up_top_enriched = purrr::map(marker_genesets_up_enriched, function(enrichr_results) {
purrr::map_dfr(names(enrichr_results), function(n) {
enrichr_result_df = cbind(enrichr_results[[n]], list(
Database = factor(rep(n, nrow(enrichr_results[[n]])), levels=names(enrichr_results)),
Direction = factor(rep("up", nrow(enrichr_results[[n]])), levels=c("up", "down"))
))
return(enrichr_result_df)
}) %>% head()
})
# Get down ten up over all databases
marker_genesets_down_top_enriched = purrr::map(marker_genesets_down_enriched, function(enrichr_results) {
purrr::map_dfr(names(enrichr_results), function(n) {
enrichr_result_df = enrichr_results[[n]]
if (nrow(enrichr_result_df) > 0 ) {
enrichr_result_df$Database = n
enrichr_result_df$Direction = "down"
} else {
enrichr_result_df = data.frame(enrichr_result_df, Database=as.character(), Direction=as.character())
}
return(enrichr_result_df)
}) %>% head()
})
# Join
marker_genesets_top_enriched = purrr::map(values_to_names(names(marker_genesets_down_top_enriched)), function(n) {
return(rbind(marker_genesets_up_top_enriched[[n]], marker_genesets_down_top_enriched[[n]]) %>%
dplyr::arrange(-Odds.Ratio))
})
# Print as tabs
for(n in names(marker_genesets_top_enriched)){
cat('##### ', n, ' \n')
cat(knitr::kable(marker_genesets_top_enriched[[n]][, c("Database", "Term", "Direction", "Adjusted.P.value", "Odds.Ratio")],
align="l", caption="Top ten enriched terms per geneset", format="html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%"))
cat(' \n \n')
}
cat(' \n \n')
}| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | MHC class I protein binding (GO:0042288) | up | 0.0026977 | 208.020833 |
| GO_Molecular_Function_2018 | MHC protein binding (GO:0042287) | up | 0.0042620 | 89.080357 |
| GO_Molecular_Function_2018 | serine-type endopeptidase activity (GO:0004252) | up | 0.0066157 | 20.506736 |
| GO_Molecular_Function_2018 | serine-type peptidase activity (GO:0008236) | up | 0.0073965 | 18.216590 |
| GO_Molecular_Function_2018 | endopeptidase activity (GO:0004175) | up | 0.0042620 | 16.026122 |
| GO_Molecular_Function_2018 | protein homodimerization activity (GO:0042803) | up | 0.0165146 | 8.364502 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | MAP kinase tyrosine/serine/threonine phosphatase activity (GO:0017017) | up | 0.0071955 | 1110.555556 |
| GO_Molecular_Function_2018 | MAP kinase phosphatase activity (GO:0033549) | up | 0.0071955 | 832.833333 |
| GO_Molecular_Function_2018 | MHC class I protein binding (GO:0042288) | up | 0.0074593 | 512.384615 |
| GO_Molecular_Function_2018 | MHC protein binding (GO:0042287) | up | 0.0108530 | 229.505747 |
| GO_Molecular_Function_2018 | protein tyrosine/serine/threonine phosphatase activity (GO:0008138) | up | 0.0108530 | 201.646465 |
| GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | down | 0.0000180 | 113.236467 |
| GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | down | 0.0000064 | 77.852665 |
| GO_Molecular_Function_2018 | immunoglobulin binding (GO:0019865) | down | 0.0017802 | 63.149364 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | down | 0.0000084 | 61.160714 |
| GO_Molecular_Function_2018 | phosphoprotein phosphatase activity (GO:0004721) | up | 0.0359009 | 49.039506 |
| GO_Molecular_Function_2018 | exopeptidase activity (GO:0008238) | down | 0.0064905 | 16.542005 |
| GO_Molecular_Function_2018 | transcription factor activity, RNA polymerase II core promoter proximal region sequence-specific binding (GO:0000982) | down | 0.0105850 | 5.103332 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | mRNA 3’-UTR AU-rich region binding (GO:0035925) | up | 0.0282789 | 399.68000 |
| GO_Molecular_Function_2018 | AU-rich element binding (GO:0017091) | up | 0.0282789 | 199.74000 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | down | 0.0000032 | 156.44706 |
| GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | down | 0.0020105 | 138.56944 |
| GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | down | 0.0001341 | 131.53187 |
| GO_Molecular_Function_2018 | mRNA 3’-UTR binding (GO:0003730) | up | 0.0470058 | 64.29677 |
| GO_Molecular_Function_2018 | cytokine receptor activity (GO:0004896) | up | 0.0470058 | 55.33889 |
| GO_Biological_Process_2018 | cellular response to tumor necrosis factor (GO:0071356) | up | 0.0162858 | 51.56771 |
| GO_Molecular_Function_2018 | oxidoreductase activity, acting on the CH-OH group of donors, NAD or NADP as acceptor (GO:0016616) | up | 0.0470058 | 45.76322 |
| GO_Molecular_Function_2018 | serine-type endopeptidase activity (GO:0004252) | down | 0.0008508 | 15.68380 |
| GO_Molecular_Function_2018 | serine-type peptidase activity (GO:0008236) | down | 0.0008849 | 13.91614 |
| GO_Molecular_Function_2018 | endopeptidase activity (GO:0004175) | down | 0.0008849 | 10.56789 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | up | 0.0000003 | 289.043478 |
| GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | up | 0.0000000 | 271.909091 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | up | 0.0000000 | 209.129371 |
| GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | down | 0.0007908 | 179.126126 |
| GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | up | 0.0000000 | 49.544900 |
| GO_Molecular_Function_2018 | disordered domain specific binding (GO:0097718) | down | 0.0159422 | 26.845946 |
| GO_Molecular_Function_2018 | transcription factor activity, RNA polymerase II distal enhancer sequence-specific binding (GO:0003705) | up | 0.0158176 | 20.811650 |
| GO_Molecular_Function_2018 | amyloid-beta binding (GO:0001540) | down | 0.0128799 | 16.033131 |
| GO_Molecular_Function_2018 | actin binding (GO:0003779) | down | 0.0116168 | 6.025464 |
| GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | down | 0.0159422 | 4.845283 |
| GO_Molecular_Function_2018 | protein homodimerization activity (GO:0042803) | up | 0.0286006 | 4.780397 |
| GO_Molecular_Function_2018 | protein homodimerization activity (GO:0042803) | down | 0.0159422 | 3.470588 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | down | 0.0001246 | 174.385965 |
| GO_Molecular_Function_2018 | ubiquitin-protein transferase inhibitor activity (GO:0055105) | down | 0.0001246 | 174.385965 |
| GO_Molecular_Function_2018 | Toll-like receptor binding (GO:0035325) | up | 0.0013078 | 59.550189 |
| GO_Molecular_Function_2018 | rRNA binding (GO:0019843) | down | 0.0000005 | 35.083081 |
| GO_Molecular_Function_2018 | phosphotyrosine residue binding (GO:0001784) | up | 0.0044074 | 15.551610 |
| GO_Molecular_Function_2018 | protein phosphorylated amino acid binding (GO:0045309) | up | 0.0053564 | 12.867032 |
| GO_Molecular_Function_2018 | mRNA binding (GO:0003729) | down | 0.0000065 | 10.901952 |
| GO_Molecular_Function_2018 | RNA binding (GO:0003723) | down | 0.0000000 | 8.320770 |
| GO_Molecular_Function_2018 | kinase binding (GO:0019900) | down | 0.0000506 | 6.011728 |
| GO_Molecular_Function_2018 | protein heterodimerization activity (GO:0046982) | up | 0.0053564 | 3.925270 |
| GO_Molecular_Function_2018 | transition metal ion binding (GO:0046914) | up | 0.0053564 | 3.194749 |
| GO_Molecular_Function_2018 | protein homodimerization activity (GO:0042803) | up | 0.0020329 | 2.922092 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | down | 0.0005745 | 293.72059 |
| GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | down | 0.0000157 | 288.00000 |
| GO_Biological_Process_2018 | granzyme-mediated apoptotic signaling pathway (GO:0008626) | up | 0.0042951 | 269.68919 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | down | 0.0000157 | 233.96484 |
| GO_Molecular_Function_2018 | beta-galactosidase activity (GO:0004565) | down | 0.0284356 | 221.95556 |
| GO_Biological_Process_2018 | lymphocyte mediated immunity (GO:0002449) | up | 0.0093912 | 119.83183 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | up | 0.0003120 | 103.88021 |
| GO_Molecular_Function_2018 | MHC protein binding (GO:0042287) | down | 0.0041184 | 83.83613 |
| GO_Biological_Process_2018 | cellular defense response (GO:0006968) | up | 0.0004909 | 43.75604 |
| GO_Molecular_Function_2018 | cytokine receptor activity (GO:0004896) | down | 0.0194262 | 32.99089 |
| GO_Biological_Process_2018 | regulation of immune response (GO:0050776) | up | 0.0000000 | 28.21577 |
| GO_Biological_Process_2018 | apoptotic process (GO:0006915) | up | 0.0047921 | 12.84162 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Biological_Process_2018 | response to interleukin-4 (GO:0070670) | up | 0.0018866 | 475.690476 |
| GO_Biological_Process_2018 | cellular response to interleukin-4 (GO:0071353) | up | 0.0018866 | 475.690476 |
| GO_Biological_Process_2018 | positive regulation of interleukin-12 production (GO:0032735) | up | 0.0104144 | 144.659420 |
| GO_Molecular_Function_2018 | phospholipase inhibitor activity (GO:0004859) | down | 0.0002426 | 143.920482 |
| GO_Biological_Process_2018 | beta-catenin-TCF complex assembly (GO:1904837) | up | 0.0109625 | 118.797619 |
| GO_Biological_Process_2018 | regulation of phosphatidylinositol 3-kinase activity (GO:0043551) | up | 0.0109625 | 107.284946 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | down | 0.0001011 | 64.712195 |
| GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | down | 0.0017751 | 55.331789 |
| GO_Biological_Process_2018 | canonical Wnt signaling pathway (GO:0060070) | up | 0.0364803 | 46.748826 |
| GO_Molecular_Function_2018 | cadherin binding involved in cell-cell adhesion (GO:0098641) | down | 0.0024344 | 44.950301 |
| GO_Molecular_Function_2018 | protein binding involved in cell-cell adhesion (GO:0098632) | down | 0.0027672 | 39.951807 |
| GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | down | 0.0000172 | 9.524592 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | urea transmembrane transporter activity (GO:0015204) | up | 0.0376939 | 285.42857 |
| GO_Molecular_Function_2018 | MAP kinase tyrosine/serine/threonine phosphatase activity (GO:0017017) | up | 0.0376939 | 237.84524 |
| GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | down | 0.0009057 | 217.03261 |
| GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | down | 0.0000445 | 209.39161 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | down | 0.0000445 | 170.10511 |
| GO_Molecular_Function_2018 | collagen binding (GO:0005518) | down | 0.0006591 | 55.45269 |
| GO_Molecular_Function_2018 | RNA polymerase II regulatory region DNA binding (GO:0001012) | up | 0.0270704 | 24.98359 |
| GO_Molecular_Function_2018 | serine-type endopeptidase activity (GO:0004252) | down | 0.0010400 | 19.62599 |
| GO_Molecular_Function_2018 | transcriptional activator activity, RNA polymerase II transcription regulatory region sequence-specific binding (GO:0001228) | up | 0.0270704 | 17.53025 |
| GO_Molecular_Function_2018 | serine-type peptidase activity (GO:0008236) | down | 0.0013527 | 17.42416 |
| GO_Molecular_Function_2018 | transcription regulatory region sequence-specific DNA binding (GO:0000976) | up | 0.0270704 | 17.03806 |
| GO_Molecular_Function_2018 | RNA polymerase II regulatory region sequence-specific DNA binding (GO:0000977) | up | 0.0376939 | 10.68271 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | down | 0.0002190 | 209.463158 |
| GO_Molecular_Function_2018 | ubiquitin-protein transferase inhibitor activity (GO:0055105) | down | 0.0136554 | 103.635417 |
| GO_Molecular_Function_2018 | cysteine-type peptidase activity (GO:0008234) | up | 0.0023542 | 10.624697 |
| GO_Molecular_Function_2018 | cysteine-type endopeptidase activity (GO:0004197) | up | 0.0011493 | 9.927904 |
| GO_Molecular_Function_2018 | mRNA binding (GO:0003729) | down | 0.0122091 | 7.437422 |
| GO_Molecular_Function_2018 | protease binding (GO:0002020) | up | 0.0076689 | 6.606191 |
| GO_Molecular_Function_2018 | kinase binding (GO:0019900) | down | 0.0025145 | 5.429479 |
| GO_Molecular_Function_2018 | RNA binding (GO:0003723) | down | 0.0000076 | 4.411334 |
| GO_Molecular_Function_2018 | protein kinase binding (GO:0019901) | down | 0.0202875 | 4.039950 |
| GO_Molecular_Function_2018 | kinase binding (GO:0019900) | up | 0.0011493 | 4.011993 |
| GO_Molecular_Function_2018 | protein kinase binding (GO:0019901) | up | 0.0011493 | 3.861584 |
| GO_Molecular_Function_2018 | protein homodimerization activity (GO:0042803) | up | 0.0011493 | 3.400355 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | mRNA 5’-UTR binding (GO:0048027) | down | 0.0000000 | 33.073659 |
| GO_Molecular_Function_2018 | rRNA binding (GO:0019843) | down | 0.0000000 | 17.288046 |
| GO_Molecular_Function_2018 | translation factor activity, RNA binding (GO:0008135) | down | 0.0000000 | 12.580069 |
| GO_Molecular_Function_2018 | RNA binding (GO:0003723) | down | 0.0000000 | 9.787102 |
| GO_Molecular_Function_2018 | mRNA binding (GO:0003729) | down | 0.0000000 | 9.346812 |
| GO_Molecular_Function_2018 | actin filament binding (GO:0051015) | up | 0.0000837 | 7.121103 |
| GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | down | 0.0000000 | 6.300408 |
| GO_Molecular_Function_2018 | actin binding (GO:0003779) | up | 0.0000036 | 5.620882 |
| GO_Molecular_Function_2018 | GTPase activity (GO:0003924) | up | 0.0004073 | 4.603610 |
| GO_Molecular_Function_2018 | ubiquitin protein ligase binding (GO:0031625) | up | 0.0004073 | 4.099573 |
| GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | up | 0.0004073 | 3.952044 |
| GO_Molecular_Function_2018 | kinase binding (GO:0019900) | up | 0.0004073 | 3.476703 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio |
|---|---|---|---|---|
| GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | down | 0.0000002 | 664.466667 |
| GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | up | 0.0000180 | 113.236467 |
| GO_Molecular_Function_2018 | RAGE receptor binding (GO:0050786) | down | 0.0110879 | 91.838710 |
| GO_Molecular_Function_2018 | Toll-like receptor binding (GO:0035325) | down | 0.0110879 | 91.838710 |
| GO_Molecular_Function_2018 | MHC class I protein binding (GO:0042288) | down | 0.0008597 | 89.083457 |
| GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | up | 0.0000064 | 77.852665 |
| GO_Molecular_Function_2018 | immunoglobulin binding (GO:0019865) | up | 0.0017802 | 63.149364 |
| GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | up | 0.0000084 | 61.160714 |
| GO_Molecular_Function_2018 | MHC protein binding (GO:0042287) | down | 0.0061629 | 36.264117 |
| GO_Molecular_Function_2018 | SH3 domain binding (GO:0017124) | down | 0.0188089 | 18.805801 |
| GO_Molecular_Function_2018 | exopeptidase activity (GO:0008238) | up | 0.0064905 | 16.542005 |
| GO_Molecular_Function_2018 | transcription factor activity, RNA polymerase II core promoter proximal region sequence-specific binding (GO:0000982) | up | 0.0105850 | 5.103332 |
If requested, we identify genes that are differentially expressed between two groups of cells. Groups can be defined by columns in the cell metadata. Different types of tests can be used and input data for testing can be the different assays as well as the computed dimensionality reductions. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.
# We first compute the DEGs for all requested contrasts
# Prepare a list with contrasts (input can be R data.frame table or Excel file)
degs_contrasts_list = DegsSetupContrastsList(sc, param$deg_contrasts, param$latent_vars)
# Add the actual data to the list
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){
# If there were already errors, just return
if (length(contrast[["error_messages"]]) > 0) return(c(contrast, list(object=NULL, cells_group1_idx_subset=as.integer(), cells_group2_idx_subset=as.integer())))
# Get cells indices
cells_group1_idx = contrast[["cells_group1_idx"]]
cells_group2_idx = contrast[["cells_group2_idx"]]
# Create object
if (contrast[["use_reduction"]]) {
# Use dimensionality reduction
contrast[["object"]] = Seurat::Reductions(sc, slot=contrast[["assay"]])
} else {
# Use assay
contrast[["object"]] = Seurat::GetAssay(sc[,unique(c(cells_group1_idx,cells_group2_idx))], assay=contrast[["assay"]])
# This saves a lot of memory for parallelisation
if (contrast[["slot"]]!="scale.data") contrast[["object"]]@scale.data = new(Class = 'matrix')
}
# Variable latent vars must be passed as data.frame
if (!is.null(contrast[["latent_vars"]]) && length(contrast[["latent_vars"]]) > 0) contrast[["latent_vars"]] = sc[[unique(c(cells_group1_idx,cells_group2_idx)), contrast[["latent_vars"]], drop=FALSE]]
# Now update cell indices so that they match to subset
contrast[["cells_group1_idx_subset"]] = match(colnames(sc)[cells_group1_idx], colnames(contrast[["object"]]))
contrast[["cells_group2_idx_subset"]] = match(colnames(sc)[cells_group2_idx], colnames(contrast[["object"]]))
return(contrast)
})
# Compute the tests; TODO: this chunk may be done in parallel in future
degs_contrasts_results = purrr::map(degs_contrasts_list, function(contrast) {
if (length(contrast$error_messages)==0) {
# No errors, do contrast
test_results = DegsTestCellSets(object=contrast[["object"]],
slot=contrast[["slot"]],
cells_1=colnames(contrast[["object"]])[contrast[["cells_group1_idx_subset"]]],
cells_2=colnames(contrast[["object"]])[contrast[["cells_group2_idx_subset"]]],
is_reduction=contrast[["use_reduction"]],
logfc.threshold=contrast[["log2FC"]],
test.use=contrast[["test"]],
min.pct=contrast[["min_pct"]],
latent.vars=contrast[["latent_vars"]])
} else {
# Errors, return empty data.frame
test_results = DegsEmptyResultsTable()
}
# Sort and filter table
test_results = test_results %>% DegsSort() %>% DegsFilter(contrast[["log2FC"]], contrast[["padj"]], split_by_dir=FALSE)
# Add mean gene expression data (counts or data, dep on slot)
avg.1 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group1_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
avg.2 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group2_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
test_results = cbind(test_results, avg.1, avg.2)
# Add test results and drop unneccessary data
contrast = c(contrast, list(results=test_results))
contrast[["object"]] = NULL
contrast[["cells_group1_idx_subset"]] = NULL
contrast[["cells_group2_idx_subset"]] = NULL
return(contrast)
})
# Also remove objects from deg_contrasts_list (save memory)
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){ contrast[["object"]] = NULL; return(contrast)})
# Compute enrichr results (not in parallel due to server load)
degs_contrasts_results = purrr::map(degs_contrasts_results, function(contrast) {
# Get results table
results_table = contrast$results
# Drop existing results
if ("enrichr" %in% names(contrast)) d[["enrichr"]] = NULL
# Split into up- and downregulated DEGs, then translate to Entrez gene, runEnrichr
degs_up = dplyr::filter(results_table, avg_log2FC > 0) %>% dplyr::pull(gene) %>% unique()
degs_up = sapply(degs_up, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
degs_up = degs_up[!is.na(degs_up)]
enrichr_results_up = EnrichrTest(genes=degs_up, databases=param$enrichr_dbs, padj=param$enrichr_padj)
degs_down = dplyr::filter(results_table, avg_log2FC < 0) %>% dplyr::pull(gene) %>% unique()
degs_down = sapply(degs_down, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
degs_down = degs_down[!is.na(degs_down)]
enrichr_results_down = EnrichrTest(genes=degs_down, databases=param$enrichr_dbs, padj=param$enrichr_padj)
# Flatten both enrichr results into tables
enrichr_results_up = purrr::map_dfr(names(enrichr_results_up), function(n) {
return(cbind(enrichr_results_up[[n]],
list(Database=factor(rep(n, nrow(enrichr_results_up[[n]])), levels=names(enrichr_results_up)),
Direction=factor(rep("up", nrow(enrichr_results_up[[n]])), levels=c("up", "down"))
)
))
})
enrichr_results_down = purrr::map_dfr(names(enrichr_results_down), function(n) {
return(cbind(enrichr_results_down[[n]],
list(Database=factor(rep(n, nrow(enrichr_results_down[[n]])), levels=names(enrichr_results_down)),
Direction=factor(rep("up", nrow(enrichr_results_down[[n]])), levels=c("up", "down"))
)
))
})
# Rbind and add factor levels
enrichr_results = dplyr::bind_rows(enrichr_results_up, enrichr_results_down)
return(c(contrast, list(enrichr=enrichr_results)))
})
# Now regroup list so that subsets are together again
original_contrast_rows = purrr::map_int(degs_contrasts_results, function(contrast){return(contrast[["contrast_row"]]) })
degs = split(degs_contrasts_results, original_contrast_rows)
# Write degs to files
degs_result_files = purrr::map_chr(degs, function(degs_subsets) {
# The output file
file = paste0(param$path_out, "/degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_results.xlsx")
# Write degs
degs_subsets_results = purrr::map(degs_subsets, function(contrast) {return(contrast[["results"]])})
names(degs_subsets_results) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
file = DegsWriteToFile(degs_subsets_results,
annot_ensembl=annot_ensembl,
gene_to_ensembl=seurat_rowname_to_ensembl,
file=file,
additional_readme=NULL)
return(file)
})
# Write enrichr results to files
degs_enrichr_files = purrr::map_chr(degs, function(degs_subsets) {
# The output file
file = paste0(param$path_out, "/degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_functions.xlsx")
# Write enrichr results
degs_subsets_enrichr = purrr::map(degs_subsets, function(contrast) {return(contrast[["enrichr"]])})
names(degs_subsets_enrichr) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
file = EnrichrWriteResults(degs_subsets_enrichr, file=file)
return(file)
})knitr_header_string = '
## {{condition_column}}: {{condition_group1}} vs {{condition_group2}}
General configuration:
* assay: {{assay}}
* slot: {{slot}}
* test: {{test}}
* maximum adjusted p-value: {{padj}}
* minimum absolute log2 foldchange: {{log2FC}}
* minimum percentage of cells: {{min_pct}}
Subset on column: \'{{subset_column}}\''
if (length(degs)==0) message("No DEG contrasts specified.")
for (i in seq(degs)) {
# Get subsets
degs_subsets = degs[[i]]
first_contrast = degs_subsets[[1]]
# Create header
cat(
knitr::knit_expand(text = knitr_header_string,
condition_column=first_contrast[["condition_column"]],
condition_group1=first_contrast[["condition_group1"]],
condition_group2=first_contrast[["condition_group2"]],
assay=first_contrast[["assay"]],
slot=first_contrast[["slot"]],
test=first_contrast[["test"]],
padj=first_contrast[["padj"]],
log2FC=first_contrast[["log2FC"]],
min_pct=first_contrast[["min_pct"]],
subset_column=ifelse(is.na(first_contrast[["subset_column"]]), "-", first_contrast[["subset_column"]]))
, '\n')
# Get error messages
error_messages = unique(purrr::flatten_chr(purrr::map(degs_subsets, function(contrast){return(contrast[["error_messages"]])})))
# Create combined results table
degs_subsets_results = purrr::map_dfr(degs_subsets, function(contrast) {
subset_group_value = ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All")
results_table = cbind(contrast[["results"]],
list(subset_group=factor(rep(subset_group_value, nrow(contrast[["results"]])), levels=c(subset_group_value)),
cells1=rep(length(contrast[["cells_group1_idx"]]), nrow(contrast[["results"]])),
cells2=rep(length(contrast[["cells_group2_idx"]]), nrow(contrast[["results"]]))
)
)
return(results_table)
}) %>% tibble::as_tibble()
# Print warnings/errors
if (length(error_messages) > 0) {
warning(error_messages)
}
# Print summary table
cat(
knitr::kable(degs_subsets_results %>%
dplyr::group_by(subset_group) %>%
dplyr::summarise(Cells1=dplyr::first(cells1),
Cells2=dplyr::first(cells2),
DEGs=length(gene),
DEGs_up=sum(avg_log2FC > 0),
DEGs_down=sum(avg_log2FC < 0)),
align="l", caption="DEG summary", col.names=c("Subset", "Cells in group 1", "Cells in group 2", "# DEGs", "# DEGs upregulated", "# DEGs downregulated"), format="html") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
)
cat('\n \n')
} General configuration:
| Subset | Cells in group 1 | Cells in group 2 | # DEGs | # DEGs upregulated | # DEGs downregulated |
|---|---|---|---|---|---|
| All | 3608 | 310 | 3417 | 1150 | 2267 |
General configuration:
| Subset | Cells in group 1 | Cells in group 2 | # DEGs | # DEGs upregulated | # DEGs downregulated |
|---|---|---|---|---|---|
| 1 | 704 | 66 | 1777 | 589 | 1188 |
| 2 | 577 | 48 | 1418 | 459 | 959 |
| 3 | 578 | 34 | 1281 | 394 | 887 |
| 4 | 410 | 24 | 1046 | 295 | 751 |
| 5 | 369 | 44 | 1452 | 495 | 957 |
| 6 | 305 | 32 | 1030 | 344 | 686 |
| 7 | 251 | 18 | 967 | 228 | 739 |
| 8 | 202 | 15 | 848 | 157 | 691 |
| 9 | 122 | 8 | 364 | 56 | 308 |
| 10 | 51 | 16 | 42 | 7 | 35 |
| 11 | 39 | 5 | 99 | 0 | 99 |
General configuration:
| Subset | Cells in group 1 | Cells in group 2 | # DEGs | # DEGs upregulated | # DEGs downregulated |
|---|---|---|---|---|---|
| 1 | 275 | 229 | 6 | 0 | 6 |
| 2 | 214 | 189 | 7 | 0 | 7 |
We export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser.
# Export UMAP coordinates
loupe_umap = as.data.frame(sc@reductions$umap@cell.embeddings)
loupe_umap = cbind(Barcode=rownames(loupe_umap), loupe_umap)
colnames(loupe_umap) = c("Barcode", "UMAP-1", "UMAP-2")
write.table(loupe_umap, file=paste0(param$path_out, "/Seurat2Loupe_umap.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export categorical metadata
loupe_meta = as.data.frame(sc@meta.data)
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
loupe_meta = cbind(Barcode=rownames(loupe_meta), loupe_meta[, idx_keep])
write.table(x=loupe_meta, file=paste0(param$path_out, "/Seurat2Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export gene sets
loupe_genesets = data.frame(List=sapply(markers_filt$up[,"cluster"], function(s) paste0("DEG_up_cluster_", s)),
Name=markers_filt$up[,"gene"],
Ensembl=ifelse(markers_filt$up[,"gene"] %in% names(seurat_rowname_to_ensembl),
seurat_rowname_to_ensembl[markers_filt$up[,"gene"]], markers_filt$up[,"gene"]))
loupe_genesets = rbind(loupe_genesets,
data.frame(List=sapply(markers_filt$down[,"cluster"], function(s) paste0("DEG_down_cluster_", s)),
Name=markers_filt$down[,"gene"],
Ensembl=ifelse(markers_filt$down[,"gene"] %in% names(seurat_rowname_to_ensembl),
seurat_rowname_to_ensembl[markers_filt$down[,"gene"]], markers_filt$down[,"gene"])))
genesets_to_export = list(genes_cc_s_phase=genes_s[,2], genes_cc_g2m_phase=genes_g2m[,2])
for (i in names(genesets_to_export)) {
tmp_genes = genesets_to_export[[i]]
tmp_genes = tmp_genes[tmp_genes %in% names(symbol_to_ensembl)]
loupe_genesets = rbind(loupe_genesets,
data.frame(List=i,
Name=tmp_genes,
Ensembl=ifelse(tmp_genes %in% names(seurat_rowname_to_ensembl),
seurat_rowname_to_ensembl[tmp_genes], tmp_genes)))
}
write.table(loupe_genesets, file=paste0(param$path_out, "/Seurat2Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")We export the assay data, clustering, visualisation, marker genes and enriched pathways in a format that can be read by the Cerebro Browser (romanhaa 2021)
# Top expressed genes
sc = cerebroApp::getMostExpressedGenes(sc,
assay=Seurat::DefaultAssay(sc),
column_sample="orig.ident",
column_cluster="seurat_clusters")
gene_lists_for_cerebro = list()
if (markers_found) {
gene_lists_for_cerebro = split(markers_filt$all$gene, markers_filt$all$cluster)
names(gene_lists_for_cerebro) = paste("Marker cluster", names(gene_lists_for_cerebro))
}
gene_lists_for_cerebro[["G2M_phase_genes"]] = genes_g2m[, 2]
gene_lists_for_cerebro[["S_phase_genes"]] = genes_s[, 2]
gene_lists_for_cerebro[["mitochondrial_genes"]] = grep(param$mt, rownames(sc), v=TRUE)
cerebro_species = gsub("_gene_ensembl", "", param$mart_dataset)
cerebro_species = ifelse(grepl("sapiens", cerebro_species), "Hg", ifelse(grepl("musculus", cerebro_species), "Mm", cerebro_species))
res = ExportToCerebro(sc=sc,
path=paste0(param$path_out,"/cerebro.crb"),
param=param,
project=param$project_id,
species=cerebro_species,
assay=DefaultAssay(sc),
column_sample="orig.ident",
column_cluster="seurat_clusters",
column_ccphase="Phase",
gene_lists=gene_lists_for_cerebro,
marker_genes=markers_filt$all,
enriched_pathways=enriched)All files generated with this report are written into the provided output folder test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/:
# Add colors to the sc object for compatibility with downstream tools
# List names should correspond to meta-data columns
# This could be done for other meta-data as well
sc@misc$colors = list("orig.ident" = param$col_samples,
"seurat_clusters" = param$col_clusters)
save.image(file=paste0(param$path_out, "/scrnaseq.RData"))The following parameters were used to run the workflow.
out = scrnaseq_params_info(params=param)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")| Name | Value |
|---|---|
| project_id | pbmc |
| path_data | name:pbmc_10x, pbmc_smartseq2; type:10x, smartseq2; path:test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/, test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz; stats:NA, NA |
| path_out | test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/ |
| file_known_markers | test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/known_markers.xlsx |
| mart_dataset | hsapiens_gene_ensembl |
| annot_version | 98 |
| annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
| mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
| mt | ^MT- |
| cell_filter | pbmc_10x:nFeature_RNA=c(200, NA), percent_mt=c(NA, 20); pbmc_smartseq2_sample1:nFeature_RNA=c(200, NA), percent_mt=c(NA, 20) |
| feature_filter | pbmc_10x:min_counts=1, min_cells=3; pbmc_smartseq2_sample1:min_counts=1, min_cells=3 |
| samples_min_cells | 10 |
| norm | RNA |
| cc_remove | FALSE |
| cc_remove_all | FALSE |
| cc_rescore_after_merge | TRUE |
| integrate_samples | method:standard; reference_dataset:1; dimensions:30 |
| pc_n | 10 |
| cluster_resolution | 0.5 |
| marker_padj | 0.05 |
| marker_log2FC | 1 |
| marker_pct | 0.25 |
| deg_contrasts | condition_column:orig.ident, orig.ident, Phase; condition_group1:pbmc_10x, pbmc_10x, G1; condition_group2:pbmc_smartseq2_sample1, pbmc_smartseq2_sample1, G2M; subset_column:NA, seurat_clusters, seurat_clusters; subset_group:NA, , 1;2 |
| enrichr_padj | 0.05 |
| enrichr_dbs | GO_Molecular_Function_2018, GO_Biological_Process_2018, GO_Cellular_Component_2018 |
| col | palevioletred |
| col_palette_samples | ggsci::pal_jama |
| col_palette_clusters | ggsci::pal_startrek |
| path_to_git | . |
| debugging_mode | default_debugging |
| file_annot | test_datasets/10x_SmartSeq2_pbmc_GSE132044/results//hsapiens_gene_ensembl.v98.annot.txt |
| col_samples | pbmc_10x=#374E55FF, pbmc_smartseq2_sample1=#DF8F44FF |
| col_clusters | 1=#CC0C00FF, 2=#5C88DAFF, 3=#84BD00FF, 4=#FFCD00FF, 5=#7C878EFF, 6=#00B5E2FF, 7=#00AF66FF, 8=#CC0C00B2, 9=#5C88DAB2, 10=#84BD00B2, 11=#FFCD00B2 |
This report was generated using the scrnaseq GitHub repository. Software versions were collected at run time.
out = scrnaseq_session_info(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Name | Version |
|---|---|
| ktrns/scrnaseq | 1eab4da6210401d2ae47032989bc3a63b1bedfa0 |
| R | R version 3.6.1 (2019-07-05) |
| Platform | x86_64-apple-darwin15.6.0 (64-bit) |
| Operating system | macOS Mojave 10.14.6 |
| Packages | abind1.4-5, annotate1.64.0, AnnotationDbi1.48.0, ape5.4-1, askpass1.1, assertthat0.2.1, bibtex0.4.2.3, Biobase2.46.0, BiocFileCache1.10.2, BiocGenerics0.32.0, BiocManager1.30.10, BiocParallel1.20.1, biomaRt2.45.9, bit4.0.4, bit644.0.5, bitops1.0-6, blob1.2.1, cerebroApp1.2.2, cli2.1.0, cluster2.1.0, codetools0.2-16, colorspace1.4-1, colourpicker1.1.0, cowplot1.1.0, crayon1.3.4, curl4.3, data.table1.12.8, DBI1.1.0, dbplyr1.4.4, DelayedArray0.12.3, deldir0.1-29, digest0.6.26, dplyr1.0.2, DT0.16, ellipsis0.3.1, enrichR2.1, evaluate0.14, fansi0.4.1, farver2.0.3, fastmap1.0.1, fitdistrplus1.1-1, formattable0.2.0.1, future1.19.1, future.apply1.6.0, geneplotter1.64.0, generics0.0.2, GenomeInfoDb1.22.1, GenomeInfoDbData1.2.2, GenomicRanges1.38.0, ggplot23.3.2, ggrepel0.8.2, ggridges0.5.2, ggsci2.9, ggtree2.0.4, globals0.13.1, glue1.4.2, goftest1.2-2, graph1.64.0, gridExtra2.3, GSEABase1.48.0, GSVA1.34.0, gtable0.3.0, highr0.8, hms0.5.3, htmltools0.5.0, htmlwidgets1.5.2, httpuv1.5.4, httr1.4.2, ica1.0-2, igraph1.2.6, IRanges2.20.2, irlba2.3.3, jsonlite1.7.1, kableExtra1.2.1, KernSmooth2.23-17, knitcitations1.0.10, knitr1.31, labeling0.4.2, later1.1.0.1, lattice0.20-41, lazyeval0.2.2, leiden0.3.3, lifecycle0.2.0, limma3.42.2, listenv0.8.0, lmtest0.9-38, lubridate1.7.9, magrittr1.5, MASS7.3-53, MAST1.12.0, Matrix1.2-18, matrixStats0.57.0, memoise1.1.0, mgcv1.8-33, mime0.9, miniUI0.1.1.1, msigdbr7.2.1, munsell0.5.0, nlme3.1-149, openssl1.4.3, openxlsx4.2.2, patchwork1.0.1, pbapply1.4-3, pillar1.4.6, pkgconfig2.0.3, plotly4.9.2.1, plyr1.8.6, png0.1-7, polyclip1.10-0, prettyunits1.1.1, progress1.2.2, promises1.1.1, purrr0.3.4, qvalue2.18.0, R62.4.1, RANN2.6.1, rappdirs0.3.1, RColorBrewer1.1-2, Rcpp1.0.5, RcppAnnoy0.0.16, RCurl1.98-1.2, readr1.4.0, RefManageR1.2.12, reshape21.4.4, reticulate1.16, rjson0.2.20, rlang0.4.8, rmarkdown2.4, ROCR1.0-11, rpart4.1-15, RSpectra0.16-0, RSQLite2.2.1, rstudioapi0.11, rsvd1.0.3, Rtsne0.15, rvcheck0.1.8, rvest0.3.6, S4Vectors0.24.4, scales1.1.1, sctransform0.3.1.9002, sessioninfo1.1.1, Seurat3.9.9.9008, shiny1.5.0, shinydashboard0.7.1, shinythemes1.1.2, shinyWidgets0.5.4, SingleCellExperiment1.8.0, spatstat1.64-1, spatstat.data1.4-3, spatstat.utils1.17-0, stringi1.5.3, stringr1.4.0, SummarizedExperiment1.16.1, survival3.2-7, tensor1.5, tibble3.0.4, tidyr1.1.2, tidyselect1.1.0, tidytree0.3.3, treeio1.10.0, uwot0.1.8.9001, vctrs0.3.4, viridis0.5.1, viridisLite0.3.0, webshot0.5.2, withr2.3.0, xfun0.21, XML3.99-0.3, xml21.3.2, xtable1.8-4, XVector0.26.0, yaml2.2.1, zip2.1.1, zlibbioc1.32.0, zoo1.8-8 |
Chen, Edward Y. 2020. “Enrichr.” . https://amp.pharm.mssm.edu/Enrichr/.
Gandolfo, Luke C., and Terence P. Speed. 2018. “RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2). Public Library of Science (PLoS): e0191629. https://doi.org/10.1371/journal.pone.0191629.
Hafemeister, Christoph, and Rahul Satija. 2019. “Normalization and Variance Stabilization of Single-Cell RNA-Seq Data Using Regularized Negative Binomial Regression.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1874-1.
Liu, Fenglin, Yuanyuan Zhang, Lei Zhang, Ziyi Li, Qiao Fang, Ranran Gao, and Zemin Zhang. 2019. “Systematic Comparative Analysis of Single-Nucleotide Variant Detection Methods from Single-Cell RNA Sequencing Data.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1863-4.
romanhaa. 2021. “Romanhaa/cerebroApp.” GitHub. https://github.com/romanhaa/cerebroApp. https://github.com/romanhaa/cerebroApp.
“Satija Lab.” 2020. http://www.satijalab.org/seurat/v3.1/cell_cycle_vignette.html. https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html.
Tirosh, I., B. Izar, S. M. Prakadan, M. H. Wadsworth, D. Treacy, J. J. Trombetta, A. Rotem, et al. 2016. “Dissecting the Multicellular Ecosystem of Metastatic Melanoma by Single-Cell RNA-Seq.” Science 352 (6282). American Association for the Advancement of Science (AAAS): 189–96. https://doi.org/10.1126/science.aad0501.
“Understanding Umap.” 2019. https://pair-code.github.io/understanding-umap/. https://pair-code.github.io/understanding-umap/.